Next changeset 1:717aee069681 (2014-11-17) |
Commit message:
Imported from capsule None |
added:
fasta_concatenate_by_species.py fasta_concatenate_by_species.xml test-data/cf_maf2fasta.dat test-data/fasta_concatenate_out.fasta utils/__init__.py utils/maf_utilities.py utils/odict.py |
b |
diff -r 000000000000 -r 2126e1b833a2 fasta_concatenate_by_species.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_concatenate_by_species.py Mon May 19 12:33:30 2014 -0400 |
[ |
#!/usr/bin/env python
#Dan Blankenberg
"""
Takes a Multiple Alignment FASTA file and concatenates
sequences for each species, resulting in one sequence
alignment per species.
"""

import shutil
import sys
import tempfile
from utils.maf_utilities import iter_fasta_alignment
from utils.odict import odict

def __main__():
    """
    Concatenate the alignment blocks of a multiple-alignment FASTA file by species.

    Command line: fasta_concatenate_by_species.py INPUT_FASTA OUTPUT_FASTA

    Each species gets its own temporary file so that the growing sequences
    never have to be held in memory. For every alignment block, species present
    in the block have their text appended; species seen in earlier blocks but
    absent from this one are padded with '-' so that all sequences keep the
    same length. A species first seen in a later block is left-padded with '-'
    for all the columns that preceded it.

    The width of a block is taken from its first component; all components of
    a block are expected to have that same length.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = odict() #species name -> temp file, in order of first appearance
    cur_size = 0 #number of alignment columns written so far
    try:
        for components in iter_fasta_alignment( input_filename ):
            #iter_fasta_alignment only yields non-empty blocks
            block_size = len( components[0].text )
            #odict.keys() returns a copy, so removing from it is safe
            species_not_written = species.keys()
            for component in components:
                if component.species not in species:
                    species[component.species] = tempfile.TemporaryFile()
                    species[component.species].write( "-" * cur_size )
                species[component.species].write( component.text )
                try:
                    species_not_written.remove( component.species )
                except ValueError:
                    #this is a new species (or one already seen in this block)
                    pass
            for spec in species_not_written:
                species[spec].write( "-" * block_size )
            cur_size += block_size
        out = open( output_filename, 'wb' )
        try:
            for spec, f in species.iteritems():
                f.seek( 0 )
                out.write( ">%s\n" % spec )
                #stream the sequence instead of reading it all into memory
                shutil.copyfileobj( f, out )
                out.write( "\n" )
        finally:
            out.close()
    finally:
        #temporary files are deleted when closed; do so even on failure
        for f in species.itervalues():
            f.close()

if __name__ == "__main__" : __main__()
b |
diff -r 000000000000 -r 2126e1b833a2 fasta_concatenate_by_species.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_concatenate_by_species.xml Mon May 19 12:33:30 2014 -0400 |
b |
@@ -0,0 +1,72 @@ +<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0"> + <description>FASTA alignment by species</description> + <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command> + <inputs> + <param name="input1" type="data" format="fasta" label="FASTA alignment"/> + </inputs> + <outputs> + <data name="out_file1" format="fasta"/> + </outputs> + <tests> + <test> + <param name="input1" value="cf_maf2fasta.dat" /> + <output name="out_file1" file="fasta_concatenate_out.fasta" /> + </test> + </tests> + <help> + +**What it does** + +This tool attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment. +It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species. + +------- + +**Example** + +Starting FASTA:: + + >hg18.chr1(+):10016339-10016341|hg18_0 + GT + >panTro2.chr1(+):10195380-10195382|panTro2_0 + GT + >rheMac2.chr1(+):13119747-13119749|rheMac2_0 + GT + >mm8.chr4(-):148269679-148269681|mm8_0 + GT + >canFam2.chr5(+):66213635-66213637|canFam2_0 + GT + + >hg18.chr1(-):100323677-100323679|hg18_1 + GT + >panTro2.chr1(-):101678671-101678673|panTro2_1 + GT + >rheMac2.chr1(-):103154011-103154013|rheMac2_1 + GT + >mm8.chr3(+):116620616-116620618|mm8_1 + GT + >canFam2.chr6(+):52954092-52954094|canFam2_1 + GT + + + +becomes:: + + >hg18 + GTGT + >panTro2 + GTGT + >rheMac2 + GTGT + >mm8 + GTGT + >canFam2 + GTGT + + +.. class:: warningmark + + This tool will only work properly on files with Galaxy-style FASTA headers. + +</help> +</tool> \ No newline at end of file |
b |
diff -r 000000000000 -r 2126e1b833a2 test-data/cf_maf2fasta.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cf_maf2fasta.dat Mon May 19 12:33:30 2014 -0400 |
b |
b'@@ -0,0 +1,134 @@\n+>hg17.chr7(+):127471195-127471526|hg17_0\n+gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>panTro1.chr6(+):129885076-129885407|panTro1_0\n+gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>rheMac2.chr3(+):165787989-165788319|rheMac2_0\n+gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTGCCATTGATGTGCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>rn3.chr4(+):56178191-56178473|rn3_0\n+CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCTGT----CAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTCGG-AGTCTAGCTGTAGACAGCCCAATGGG---------------------------------------------------------TATAAC---------AATACTCACTAA\n+>mm7.chr6(+):28984529-28984886|mm7_0\n+CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAACG-AGGGTGGTCCAGTTACTATCTTG---ACTGCAGCTGG----CAGTCAGTTGCCACT--CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGA
TCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCC-AGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA\n+\n+>hg17.chr7(+):127471526-127471584|hg17_1\n+AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+>mm7.chr6(+):28984886-28984940|mm7_1\n+----AACGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG\n+>rheMac2.chr3(+):165788319-165788377|rheMac2_1\n+AATTTGTGGTTTATTTATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+>panTro1.chr6(+):129885407-129885465|panTro1_1\n+AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+\n+>hg17.chr7(+):127471584-127471688|hg17_2\n+GAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt\n+>panTro1.chr6(+):129885465-129885569|panTro1_2\n+GAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt\n+>rheMac2.chr3(+):165788377-165788482|rheMac2_2\n+GAGATATTT-GGggaaatttg-gtatagactagctt--tcatgatgtaagggagttatttttgtgtgataatggccctacagttac-acagaaattcttccttatttttt\n+>canFam2.chr14(-):11090703-11090811|canFam2_2\n+gagatattt-gggggaatttgaatgtagtgttgctcttttgtgatgctaagaaattataattgtctgatgatagtctcgtggttatgggggaaatgcttcctta-ttttt\n+>bosTau2.chr4(-):50243931-50244034|bosTau2_2\n+-agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-ttttt\n+>rn3.chr4(+):56182200-56182295|rn3_2\n+----TATTTGGGGGAAATATG-ATGTGCA----CTT--CCATGATCTTAAAGAATTGCTACTGTTTGATAGTGATCTTATGGTTAA-ATAAAAAAAAT--CTTA-GTTGT\n+>dasNov1.scaffold_256527(+):298-392|dasNov1_2\n+GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-tttta\n+\n+>hg17.chr7(+):127471688-127471871|hg17_3\n+tgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt--
---gatgggtatattgtagttcactatagtattgcaca'..b'TCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG-\n+\n+>hg17.chr7(+):127472258-127472280|hg17_7\n+gatggccca-atccctgtcctct-\n+>panTro1.chr6(+):129886139-129886161|panTro1_7\n+gatggccca-atccctgtcctct-\n+>rheMac2.chr3(+):165789069-165789091|rheMac2_7\n+gatggccca-atccctgtcctct-\n+>mm7.chr6(+):28991025-28991048|mm7_7\n+AATGGCAGAGGGCTCTGTTCTCT-\n+>rn3.chr4(+):56183879-56183902|rn3_7\n+AATGGCAGAGGCCCCTGTTCTCT-\n+>canFam2.chr14(-):11089526-11089548|canFam2_7\n+GGAGACTTG-ATGCCTGCCTTCC-\n+>dasNov1.scaffold_256527(+):964-987|dasNov1_7\n+GACGGCCAG-ACCTCTGCCCTCGG\n+\n+>hg17.chr7(+):127472280-127472681|hg17_8\n+taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgt\n+>panTro1.chr6(+):129886161-129886562|panTro1_8\n+taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt\n+>rheMac2.chr3(+):165789091-165789492|rheMac2_8\n+taaaacctaatggaggagatggaATG-GGTCACCCAACCCGGACTGAGAGACAGGAATTAGCTGCAAGGGTAACCAGGACAAGCTTCTCTA---ATGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCTGGGAGAAGCGATGGATTCGTAGTTGGGCATCCCCACAGAGGGACTGGAAAGAAAAAAGACCTGGAGGAACCA------ATGTGC-AATGTATGTGTGTTTCCTGGTTcaagggctggc
aaactttctcta--aagggccagatagaaaacattttaggctttgtaagccaagg---caaaatcgaggag-attacatgggtacttatacaacaagaataaacaatt---tccacaa--tttttattcacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt\n+>rn3.chr4(+):56183902-56184219|rn3_8\n+------------------------------------GTCCATAGTCAAAG------------------------------AAGCCTCTCAG---ATGGAG--AGCAGGGCCTATGCAAAAGAGGGGGCTTCTGTAGGCAGAAGGGATGGACTAGCCTCCGGACATAGCCATAGAGAGGCTGGCAGGACTGAGACCCAGGAGAAGCCAGCGCAGGTGTGCGGGCGTGTGTATATTTCATAGTTTGCAGGTTGG----------------------------CAAACAATTCCTGCTTTGCAGGCCAAGA---GGAAACTGAAGGTGACCCCGTGAGTGCTTAC---ACAAGAGAAAACAAG-------ACAA-TTTTTGGTTGACCAAATTCAGAA---CTTTATTTGAGGATGC---TAAAGTTTAAATTTCTTTT\n+>canFam2.chr14(-):11089143-11089523|canFam2_8\n+TACAGCCTGTGGGCAGAGGTGGGAAGAGGTCACGCAAGCCAGTTGGAATGAGGGGAGTTGGCTGGAAAGGTGACCAGGACAAGCTACTTCAACCAGGAAG--AAGAGACCCCG-GTG----------------CTTGGAGAAGGCCTGATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatat\n+>dasNov1.scaffold_256527(+):987-1401|dasNov1_8\n+CTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatat\n+\n+>hg17.chr7(+):127472681-127472715|hg17_9\n+aattttcccat---gagaactattcttcttttgtttt\n+>rheMac2.chr3(+):165789492-165789526|rheMac2_9\n+aattttcacat---aagaactattcttcttttgtttt\n+>panTro1.chr6(+):129886562-129886596|panTro1_9\n+aattttcccgt---gagaactattcttcttttgtttt\n+>canFam2.chr14(-):11089108-11089143|canFam2_9\n+aatggtcatgt--ccataactattcttcttttatttt\n+>dasNov1.
scaffold_256527(+):1401-1433|dasNov1_9\n+aattttcacatatcacgaagtatttttttttt-----\n+\n' |
b |
diff -r 000000000000 -r 2126e1b833a2 test-data/fasta_concatenate_out.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_concatenate_out.fasta Mon May 19 12:33:30 2014 -0400 |
b |
b'@@ -0,0 +1,16 @@\n+>hg17\n+gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaatttttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACCTG-TGTGATCCTGGCTTTTCCTGTTCCCTCCG---CATCATCACTGCAGGTGTGTTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-ccc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgtaattttcccat---gagaactattcttcttttgtttt\n+>panTro1\n+gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatat
tgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaaacaccaaagta-------gggataaaatgtcatgatgtgtgcaatacgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaaattttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACATG-TGTGATCCTGGCTTTTCCTGTTCCCTCTG---CATCATCACTGCAGGTGTATTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgtaattttcccgt---gagaactattcttcttttgtttt\n+>rheMac2\n+gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTG
CCATTGA'..b'GATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatataatggtcatgt--ccataactattcttcttttatttt\n+>bosTau2\n+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-tttttagataaacacttaagtattta---aggatgaaacgccctgatgtttgtaatttgctttagaatattttagccaaaa----------gaattaa-------------------------tgatgc--aaatatg--caaaaagagta--cgttaaacctaa-----------------------------------------------------atttgCGATTttcattta--------------------------aaaa-tcttcgtgcaacgcacggggctatcaatgt-gggatacagatgtgaacaa---------------------------------------------------------------------------------------------aacggacccgtgtgggactcggcggagcacacagattttgcgggagCACGTTCCCGTTAGGAAGTCTCTGATGCAATACGACCGGTGCCTTCAGGACCTG-TG--AGGCTGACTTTCCTTA-CCCCTCCACACCATCATCAAGGCAGGTGTGATTTTCCAGG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n+>dasNov1\n+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-ttttaagacgcatgctgaagcatgta---aggataaaatgtcgtggtgtttgtaatttattctaaaacattttagccaaaaacaaataaataaataaa-------------------------tgaagc--aaatatgggggaaatgtttaattgttaaatctagatttaacacggtatataccgtgcttcattatactagtctctacttttccatgtgtttgaaattttCATTAAAATGTTTGTTTGTTGTCTGTTTTAATGAAATCCTTTGTGCTAGCCACTGGGATGAAAGCTAGGGAACACAGCAGTGAGCAA-----------------------------------------------------------------------------------------------CAGCCTGGCTCCGT-CC--GGGGGCCGCTCAGCAGCTC-GGGAGCGTGGAGACG---GGAAGTCTGTCACGCGATGCG-----------CTGGGCCCG------------CTGTTCCCGCCCCCCTCC---CCCC----------------TTTCCCAAGTTTTAAA------AATTTACCTTCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG-GACGGCCAG-ACCTCTGCCCTCGGCTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatataattttcacatatcacgaagtat
ttttttttt-----\n' |
b |
diff -r 000000000000 -r 2126e1b833a2 utils/maf_utilities.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/maf_utilities.py Mon May 19 12:33:30 2014 -0400 |
[ |
b'@@ -0,0 +1,601 @@\n+#!/usr/bin/env python\n+"""\n+Provides wrappers and utilities for working with MAF files and alignments.\n+"""\n+#Dan Blankenberg\n+import pkg_resources; pkg_resources.require( "bx-python" )\n+import bx.align.maf\n+import bx.intervals\n+import bx.interval_index_file\n+import sys, os, string, tempfile\n+import logging\n+from copy import deepcopy\n+\n+assert sys.version_info[:2] >= ( 2, 4 )\n+\n+log = logging.getLogger(__name__)\n+\n+\n+GAP_CHARS = [ \'-\' ]\n+SRC_SPLIT_CHAR = \'.\'\n+\n+def src_split( src ):\n+ fields = src.split( SRC_SPLIT_CHAR, 1 )\n+ spec = fields.pop( 0 )\n+ if fields:\n+ chrom = fields.pop( 0 )\n+ else:\n+ chrom = spec\n+ return spec, chrom\n+\n+def src_merge( spec, chrom, contig = None ):\n+ if None in [ spec, chrom ]:\n+ spec = chrom = spec or chrom\n+ return bx.align.maf.src_merge( spec, chrom, contig )\n+\n+def get_species_in_block( block ):\n+ species = []\n+ for c in block.components:\n+ spec, chrom = src_split( c.src )\n+ if spec not in species:\n+ species.append( spec )\n+ return species\n+\n+def tool_fail( msg = "Unknown Error" ):\n+ print >> sys.stderr, "Fatal Error: %s" % msg\n+ sys.exit()\n+\n+#an object corresponding to a reference layered alignment\n+class RegionAlignment( object ):\n+\n+ DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )\n+ MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed\n+\n+ def __init__( self, size, species = [] ):\n+ assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." 
% ( size, self.MAX_SEQUENCE_SIZE )\n+ self.size = size\n+ self.sequences = {}\n+ if not isinstance( species, list ):\n+ species = [species]\n+ for spec in species:\n+ self.add_species( spec )\n+\n+ #add a species to the alignment\n+ def add_species( self, species ):\n+ #make temporary sequence files\n+ self.sequences[species] = tempfile.TemporaryFile()\n+ self.sequences[species].write( "-" * self.size )\n+\n+ #returns the names for species found in alignment, skipping names as requested\n+ def get_species_names( self, skip = [] ):\n+ if not isinstance( skip, list ): skip = [skip]\n+ names = self.sequences.keys()\n+ for name in skip:\n+ try: names.remove( name )\n+ except: pass\n+ return names\n+\n+ #returns the sequence for a species\n+ def get_sequence( self, species ):\n+ self.sequences[species].seek( 0 )\n+ return self.sequences[species].read()\n+\n+ #returns the reverse complement of the sequence for a species\n+ def get_sequence_reverse_complement( self, species ):\n+ complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]\n+ complement.reverse()\n+ return "".join( complement )\n+\n+ #sets a position for a species\n+ def set_position( self, index, species, base ):\n+ if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )\n+ return self.set_range( index, species, base )\n+ #sets a range for a species\n+ def set_range( self, index, species, bases ):\n+ if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )\n+ if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." 
)\n+ if species not in self.sequences.keys(): self.add_species( species )\n+ self.sequences[species].seek( index )\n+ self.sequences[species].write( bases )\n+\n+ #Flush temp file of specified species, or all species\n+ def flush( self, species = None ):\n+ if species is None:\n+ species = self.sequences.keys()\n+ elif not isinstance( species, list ):\n+ species = [species]\n+ for spec in species:\n+ self.sequences[spec].flush()\n+\n+class GenomicRegionAlignment( R'..b' for c in block.components:\n+ if c.src == src:\n+ yield c\n+\n+def get_components_by_src( block, src ):\n+ return [ value for value in iter_components_by_src( block, src ) ]\n+\n+def iter_components_by_src_start( block, src ):\n+ for c in block.components:\n+ if c.src.startswith( src ):\n+ yield c\n+\n+def get_components_by_src_start( block, src ):\n+ return [ value for value in iter_components_by_src_start( block, src ) ]\n+\n+def sort_block_components_by_block( block1, block2 ):\n+ #orders the components in block1 by the index of the component in block2\n+ #block1 must be a subset of block2\n+ #occurs in-place\n+ return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )\n+\n+def get_species_in_maf( maf_filename ):\n+ species = []\n+ for block in bx.align.maf.Reader( open( maf_filename ) ):\n+ for spec in get_species_in_block( block ):\n+ if spec not in species:\n+ species.append( spec )\n+ return species\n+\n+def parse_species_option( species ):\n+ if species:\n+ species = species.split( \',\' )\n+ if \'None\' not in species:\n+ return species\n+ return None #provided species was \'\', None, or had \'None\' in it\n+\n+def remove_temp_index_file( index_filename ):\n+ try: os.unlink( index_filename )\n+ except: pass\n+\n+#Below are methods to deal with FASTA files\n+\n+def get_fasta_header( component, attributes = {}, suffix = None ):\n+ header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), 
component.get_forward_strand_end() )\n+ for key, value in attributes.iteritems():\n+ header = "%s%s=%s|" % ( header, key, value )\n+ if suffix:\n+ header = "%s%s" % ( header, suffix )\n+ else:\n+ header = "%s%s" % ( header, src_split( component.src )[ 0 ] )\n+ return header\n+\n+def get_attributes_from_fasta_header( header ):\n+ if not header: return {}\n+ attributes = {}\n+ header = header.lstrip( \'>\' )\n+ header = header.strip()\n+ fields = header.split( \'|\' )\n+ try:\n+ region = fields[0]\n+ region = region.split( \'(\', 1 )\n+ temp = region[0].split( \'.\', 1 )\n+ attributes[\'species\'] = temp[0]\n+ if len( temp ) == 2:\n+ attributes[\'chrom\'] = temp[1]\n+ else:\n+ attributes[\'chrom\'] = temp[0]\n+ region = region[1].split( \')\', 1 )\n+ attributes[\'strand\'] = region[0]\n+ region = region[1].lstrip( \':\' ).split( \'-\' )\n+ attributes[\'start\'] = int( region[0] )\n+ attributes[\'end\'] = int( region[1] )\n+ except:\n+ #fields 0 is not a region coordinate\n+ pass\n+ if len( fields ) > 2:\n+ for i in xrange( 1, len( fields ) - 1 ):\n+ prop = fields[i].split( \'=\', 1 )\n+ if len( prop ) == 2:\n+ attributes[ prop[0] ] = prop[1]\n+ if len( fields ) > 1:\n+ attributes[\'__suffix__\'] = fields[-1]\n+ return attributes\n+\n+def iter_fasta_alignment( filename ):\n+ class fastaComponent:\n+ def __init__( self, species, text = "" ):\n+ self.species = species\n+ self.text = text\n+ def extend( self, text ):\n+ self.text = self.text + text.replace( \'\\n\', \'\' ).replace( \'\\r\', \'\' ).strip()\n+ #yields a list of fastaComponents for a FASTA file\n+ f = open( filename, \'rb\' )\n+ components = []\n+ #cur_component = None\n+ while True:\n+ line = f.readline()\n+ if not line:\n+ if components:\n+ yield components\n+ return\n+ line = line.strip()\n+ if not line:\n+ if components:\n+ yield components\n+ components = []\n+ elif line.startswith( \'>\' ):\n+ attributes = get_attributes_from_fasta_header( line )\n+ components.append( fastaComponent( 
attributes[\'species\'] ) )\n+ elif components:\n+ components[-1].extend( line )\n+\n' |
b |
diff -r 000000000000 -r 2126e1b833a2 utils/odict.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/odict.py Mon May 19 12:33:30 2014 -0400 |
[ |
"""
Ordered dictionary implementation.
"""

try:
    from UserDict import UserDict #Python 2
except ImportError:
    from collections import UserDict #Python 3

class odict(UserDict):
    """
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747

    This dictionary class extends UserDict to record the order in which items are
    added. Calling keys(), values(), items(), etc. will return results in this
    order.

    The insertion order is kept in the list self._keys; the values live in the
    UserDict backing dictionary self.data. Every mutating method must keep the
    two in sync.
    """
    def __init__( self, dict = None ):
        #_keys must exist before UserDict.__init__, which calls self.update()
        self._keys = []
        UserDict.__init__( self, dict )

    def __delitem__( self, key ):
        #raises KeyError (before touching _keys) when key is absent
        UserDict.__delitem__( self, key )
        self._keys.remove( key )

    def __setitem__( self, key, item ):
        #membership is tested on the backing dict: O(1) instead of a list scan
        if key not in self.data:
            self._keys.append( key )
        UserDict.__setitem__( self, key, item )

    def clear( self ):
        UserDict.clear( self )
        self._keys = []

    def copy(self):
        """Return a shallow copy that preserves the key order."""
        new = odict()
        new.update( self )
        return new

    def items( self ):
        """Return a list of ( key, value ) pairs in insertion order."""
        return [ ( key, self[ key ] ) for key in self._keys ]

    def keys( self ):
        """Return a new list of the keys in insertion order."""
        return self._keys[:]

    def pop( self, key, *args ):
        """
        Remove key and return its value; return the optional default when key
        is absent, or raise KeyError when no default is given.
        """
        #without this override, Python 2's UserDict.pop would leave a stale
        #entry behind in _keys
        if key in self.data:
            self._keys.remove( key )
        return self.data.pop( key, *args )

    def popitem( self ):
        """Remove and return the most recently added ( key, value ) pair."""
        try:
            key = self._keys[-1]
        except IndexError:
            raise KeyError( 'dictionary is empty' )
        val = self[ key ]
        del self[ key ]
        return ( key, val )

    def setdefault( self, key, failobj=None ):
        """Return the value for key, first storing failobj if key is absent."""
        if key not in self.data:
            self[ key ] = failobj
        return self[ key ]

    def update( self, dict = None, **kwargs ):
        """
        Add the items of a mapping (anything with an items() method) or of an
        iterable of ( key, value ) pairs, followed by any keyword arguments.
        Existing keys keep their position; new keys are appended in the order
        in which they are encountered.
        """
        if dict is not None:
            if hasattr( dict, 'items' ):
                pairs = dict.items()
            else:
                pairs = dict
            for ( key, val ) in pairs:
                self[ key ] = val
        for ( key, val ) in kwargs.items():
            self[ key ] = val

    def values( self ):
        """Return a list of the values in insertion order."""
        return [ self[ key ] for key in self._keys ]

    def iterkeys( self ):
        return iter( self._keys )

    def itervalues( self ):
        for key in self._keys:
            yield self.get( key )

    def iteritems( self ):
        for key in self._keys:
            yield key, self.get( key )

    def __iter__( self ):
        for key in self._keys:
            yield key

    def reverse( self ):
        """Reverse the key order in place."""
        self._keys.reverse()

    def insert( self, index, key, item ):
        """
        Store item under key, placing a new key at position index. A key that
        is already present keeps its current position; only its value changes.
        """
        if key not in self.data:
            self._keys.insert( index, key )
        UserDict.__setitem__( self, key, item )