Repository 'fasta_concatenate_by_species'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/fasta_concatenate_by_species

Changeset 0:2126e1b833a2 (2014-05-19)
Next changeset 1:717aee069681 (2014-11-17)
Commit message:
Imported from capsule None
added:
fasta_concatenate_by_species.py
fasta_concatenate_by_species.xml
test-data/cf_maf2fasta.dat
test-data/fasta_concatenate_out.fasta
utils/__init__.py
utils/maf_utilities.py
utils/odict.py
b
diff -r 000000000000 -r 2126e1b833a2 fasta_concatenate_by_species.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_concatenate_by_species.py Mon May 19 12:33:30 2014 -0400
[
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+"""
+Takes a Multiple Alignment FASTA file and concatenates 
+sequences for each species, resulting in one sequence 
+alignment per species.
+"""
+
+import sys, tempfile
+from utils.maf_utilities import iter_fasta_alignment
+from utils.odict import odict
+
+#Number of bytes copied per read when streaming a temporary file to the output
+COPY_CHUNK_SIZE = 1024 * 1024
+
+def __main__():
+    """
+    Concatenate the blocks of a FASTA alignment into one sequence per species.
+
+    Reads the alignment named by sys.argv[1] block by block and writes one
+    FASTA record per species, in order of first appearance, to the file named
+    by sys.argv[2]. A species missing from a block receives a run of "-" as
+    long as the first component of that block, so columns stay in register.
+    Exits with a usage message when fewer than two arguments are given.
+    """
+    if len( sys.argv ) < 3:
+        sys.exit( "Usage: fasta_concatenate_by_species.py input_fasta output_fasta" )
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    #species name -> temporary file accumulating that species' sequence
+    species = odict()
+    #number of alignment columns written to every known species so far
+    cur_size = 0
+    try:
+        for components in iter_fasta_alignment( input_filename ):
+            #every component of a block is assumed to have this length
+            block_size = len( components[0].text )
+            #keys() returns a fresh list, so removing from it leaves the odict intact
+            species_not_written = species.keys()
+            for component in components:
+                if component.species not in species:
+                    #first sighting: back-fill gaps for all earlier blocks
+                    species[component.species] = tempfile.TemporaryFile()
+                    species[component.species].write( "-" * cur_size )
+                species[component.species].write( component.text )
+                try:
+                    species_not_written.remove( component.species )
+                except ValueError:
+                    #this is a new species, or one repeated within the block
+                    pass
+            #pad the species that did not appear in this block
+            for spec in species_not_written:
+                species[spec].write( "-" * block_size )
+            cur_size += block_size
+        out = open( output_filename, 'wb' )
+        try:
+            for spec, f in species.iteritems():
+                f.seek( 0 )
+                out.write( ">%s\n" % spec )
+                #copy in chunks so a whole sequence is never held in memory
+                while True:
+                    chunk = f.read( COPY_CHUNK_SIZE )
+                    if not chunk:
+                        break
+                    out.write( chunk )
+                out.write( "\n" )
+        finally:
+            #close the output even if writing fails part way through
+            out.close()
+    finally:
+        #release the temporary files whether or not processing succeeded
+        for f in species.itervalues():
+            f.close()
+
+if __name__ == "__main__" : __main__()
b
diff -r 000000000000 -r 2126e1b833a2 fasta_concatenate_by_species.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_concatenate_by_species.xml Mon May 19 12:33:30 2014 -0400
b
@@ -0,0 +1,72 @@
+<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0">
+  <description>FASTA alignment by species</description>
+  <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command>
+  <inputs>
+    <param name="input1" type="data" format="fasta" label="FASTA alignment"/>
+  </inputs>
+  <outputs>
+    <data name="out_file1" format="fasta"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input1" value="cf_maf2fasta.dat" />
+      <output name="out_file1" file="fasta_concatenate_out.fasta" />
+    </test>
+  </tests>
+  <help>
+  
+**What it does**
+  
+This tool attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment.
+It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species.
+
+-------
+
+**Example**
+
+Starting FASTA::
+  
+  >hg18.chr1(+):10016339-10016341|hg18_0
+  GT
+  >panTro2.chr1(+):10195380-10195382|panTro2_0
+  GT
+  >rheMac2.chr1(+):13119747-13119749|rheMac2_0
+  GT
+  >mm8.chr4(-):148269679-148269681|mm8_0
+  GT
+  >canFam2.chr5(+):66213635-66213637|canFam2_0
+  GT
+  
+  >hg18.chr1(-):100323677-100323679|hg18_1
+  GT
+  >panTro2.chr1(-):101678671-101678673|panTro2_1
+  GT
+  >rheMac2.chr1(-):103154011-103154013|rheMac2_1
+  GT
+  >mm8.chr3(+):116620616-116620618|mm8_1
+  GT
+  >canFam2.chr6(+):52954092-52954094|canFam2_1
+  GT
+  
+
+
+becomes::
+  
+  >hg18
+  GTGT
+  >panTro2
+  GTGT
+  >rheMac2
+  GTGT
+  >mm8
+  GTGT
+  >canFam2
+  GTGT
+
+
+.. class:: warningmark 
+
+ This tool will only work properly on files with Galaxy style FASTA headers.
+
+</help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 2126e1b833a2 test-data/cf_maf2fasta.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cf_maf2fasta.dat Mon May 19 12:33:30 2014 -0400
b
b'@@ -0,0 +1,134 @@\n+>hg17.chr7(+):127471195-127471526|hg17_0\n+gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>panTro1.chr6(+):129885076-129885407|panTro1_0\n+gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>rheMac2.chr3(+):165787989-165788319|rheMac2_0\n+gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTGCCATTGATGTGCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAA\n+>rn3.chr4(+):56178191-56178473|rn3_0\n+CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCTGT----CAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTCGG-AGTCTAGCTGTAGACAGCCCAATGGG---------------------------------------------------------TATAAC---------AATACTCACTAA\n+>mm7.chr6(+):28984529-28984886|mm7_0\n+CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAACG-AGGGTGGTCCAGTTACTATCTTG---ACTGCAGCTGG----CAGTCAGTTGCCACT--CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGA
TCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCC-AGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA\n+\n+>hg17.chr7(+):127471526-127471584|hg17_1\n+AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+>mm7.chr6(+):28984886-28984940|mm7_1\n+----AACGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG\n+>rheMac2.chr3(+):165788319-165788377|rheMac2_1\n+AATTTGTGGTTTATTTATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+>panTro1.chr6(+):129885407-129885465|panTro1_1\n+AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG\n+\n+>hg17.chr7(+):127471584-127471688|hg17_2\n+GAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt\n+>panTro1.chr6(+):129885465-129885569|panTro1_2\n+GAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttt\n+>rheMac2.chr3(+):165788377-165788482|rheMac2_2\n+GAGATATTT-GGggaaatttg-gtatagactagctt--tcatgatgtaagggagttatttttgtgtgataatggccctacagttac-acagaaattcttccttatttttt\n+>canFam2.chr14(-):11090703-11090811|canFam2_2\n+gagatattt-gggggaatttgaatgtagtgttgctcttttgtgatgctaagaaattataattgtctgatgatagtctcgtggttatgggggaaatgcttcctta-ttttt\n+>bosTau2.chr4(-):50243931-50244034|bosTau2_2\n+-agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-ttttt\n+>rn3.chr4(+):56182200-56182295|rn3_2\n+----TATTTGGGGGAAATATG-ATGTGCA----CTT--CCATGATCTTAAAGAATTGCTACTGTTTGATAGTGATCTTATGGTTAA-ATAAAAAAAAT--CTTA-GTTGT\n+>dasNov1.scaffold_256527(+):298-392|dasNov1_2\n+GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-tttta\n+\n+>hg17.chr7(+):127471688-127471871|hg17_3\n+tgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt--
---gatgggtatattgtagttcactatagtattgcaca'..b'TCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG-\n+\n+>hg17.chr7(+):127472258-127472280|hg17_7\n+gatggccca-atccctgtcctct-\n+>panTro1.chr6(+):129886139-129886161|panTro1_7\n+gatggccca-atccctgtcctct-\n+>rheMac2.chr3(+):165789069-165789091|rheMac2_7\n+gatggccca-atccctgtcctct-\n+>mm7.chr6(+):28991025-28991048|mm7_7\n+AATGGCAGAGGGCTCTGTTCTCT-\n+>rn3.chr4(+):56183879-56183902|rn3_7\n+AATGGCAGAGGCCCCTGTTCTCT-\n+>canFam2.chr14(-):11089526-11089548|canFam2_7\n+GGAGACTTG-ATGCCTGCCTTCC-\n+>dasNov1.scaffold_256527(+):964-987|dasNov1_7\n+GACGGCCAG-ACCTCTGCCCTCGG\n+\n+>hg17.chr7(+):127472280-127472681|hg17_8\n+taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgt\n+>panTro1.chr6(+):129886161-129886562|panTro1_8\n+taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt\n+>rheMac2.chr3(+):165789091-165789492|rheMac2_8\n+taaaacctaatggaggagatggaATG-GGTCACCCAACCCGGACTGAGAGACAGGAATTAGCTGCAAGGGTAACCAGGACAAGCTTCTCTA---ATGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCTGGGAGAAGCGATGGATTCGTAGTTGGGCATCCCCACAGAGGGACTGGAAAGAAAAAAGACCTGGAGGAACCA------ATGTGC-AATGTATGTGTGTTTCCTGGTTcaagggctggc
aaactttctcta--aagggccagatagaaaacattttaggctttgtaagccaagg---caaaatcgaggag-attacatgggtacttatacaacaagaataaacaatt---tccacaa--tttttattcacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgt\n+>rn3.chr4(+):56183902-56184219|rn3_8\n+------------------------------------GTCCATAGTCAAAG------------------------------AAGCCTCTCAG---ATGGAG--AGCAGGGCCTATGCAAAAGAGGGGGCTTCTGTAGGCAGAAGGGATGGACTAGCCTCCGGACATAGCCATAGAGAGGCTGGCAGGACTGAGACCCAGGAGAAGCCAGCGCAGGTGTGCGGGCGTGTGTATATTTCATAGTTTGCAGGTTGG----------------------------CAAACAATTCCTGCTTTGCAGGCCAAGA---GGAAACTGAAGGTGACCCCGTGAGTGCTTAC---ACAAGAGAAAACAAG-------ACAA-TTTTTGGTTGACCAAATTCAGAA---CTTTATTTGAGGATGC---TAAAGTTTAAATTTCTTTT\n+>canFam2.chr14(-):11089143-11089523|canFam2_8\n+TACAGCCTGTGGGCAGAGGTGGGAAGAGGTCACGCAAGCCAGTTGGAATGAGGGGAGTTGGCTGGAAAGGTGACCAGGACAAGCTACTTCAACCAGGAAG--AAGAGACCCCG-GTG----------------CTTGGAGAAGGCCTGATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatat\n+>dasNov1.scaffold_256527(+):987-1401|dasNov1_8\n+CTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatat\n+\n+>hg17.chr7(+):127472681-127472715|hg17_9\n+aattttcccat---gagaactattcttcttttgtttt\n+>rheMac2.chr3(+):165789492-165789526|rheMac2_9\n+aattttcacat---aagaactattcttcttttgtttt\n+>panTro1.chr6(+):129886562-129886596|panTro1_9\n+aattttcccgt---gagaactattcttcttttgtttt\n+>canFam2.chr14(-):11089108-11089143|canFam2_9\n+aatggtcatgt--ccataactattcttcttttatttt\n+>dasNov1.
scaffold_256527(+):1401-1433|dasNov1_9\n+aattttcacatatcacgaagtatttttttttt-----\n+\n'
b
diff -r 000000000000 -r 2126e1b833a2 test-data/fasta_concatenate_out.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_concatenate_out.fasta Mon May 19 12:33:30 2014 -0400
b
b'@@ -0,0 +1,16 @@\n+>hg17\n+gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGATATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggaattattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaagcaccaaagta-------gggataaaatgtcatgatgtgtgcaatacactttaaaatgtttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaatttttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACCTG-TGTGATCCTGGCTTTTCCTGTTCCCTCCG---CATCATCACTGCAGGTGTGTTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-ccc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGGCCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCACAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgcaaatttcctgtaattttcccat---gagaactattcttcttttgtttt\n+>panTro1\n+gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatat
tgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGG-------------------------------------------------------TATGAA---------AACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGGGAGACATTT-GGggaaatttt-gtatagactagctt--tcacgatgttagggagttattattgtgtgataatggtcttgcagttac-acagaaattcttcctta-ttttttgggaaacaccaaagta-------gggataaaatgtcatgatgtgtgcaatacgctttaaaatatttttgccaaaa----------taattaa-------------------------tgaagc--aaatatg---gaaaataataattattaaatctaggt-----gatgggtatattgtagttcactatagtattgcacacttttctgtatgtttaaaattttcattta--------------------------aaaa-actttgagctagacaccaggctatgagcta-ggagcatagcaatgaccaa----------------------------------------------------------------------------------------------atagactcctaccaa-ctc-aaagaatgcacattctCTG-GGAAACATGTTTCCATTAGGAAGCCTCGAATGCAATGTGACTGTGGTCTCCAGGACATG-TGTGATCCTGGCTTTTCCTGTTCCCTCTG---CATCATCACTGCAGGTGTATTTTCCCAAGTTTTAAA------CATTTACCTTCCCAGTGGCCTTGCGTCTAGAGGAATCCCTGTATAGTGGT-ACATGAATATAACACATAACAAA-AATCATCTCTATGGTGTGTGTTGTTCCTGGGGTTCAattcagcaaatttt-tcc-tgggcacccatgtgttcttggcactggaaaagtaccgggactgaaacagttgatggccca-atccctgtcctct-taaaacctaagggaggagaTGGAAAG-GGGCACCCAACCCAGACTGAGAGACAGGAATTAGCTGCAAGGGGAACTAGGAAAAGCTTCTTTA---AGGATG--GAGAGACCCTA-GTGGAATGGGGAGATTCTTCCGGGAGAAGCGATGGATGCGCAGTTGGGCATCCCCACAGACGGACTGGAAAGAAAAAAGGCCTGGAGGAATCA------ATGTGC-AATGTATGTGTGTTCCCTGGTTcaagggctgg-gaactttctcta--aagggccaggtagaaaacattttaggctttctaagccaagg---caaaattgaggat-attacatgggtacttatacaacaagaataaacaatt---tacacaa-ttttttgttgacagaattcaaaa---ctttat----agacac---agaaatgtaaatttcctgtaattttcccgt---gagaactattcttcttttgtttt\n+>rheMac2\n+gcttgccatcttttgatgctcttgggaatccagcagctgtcaccat-taaacaagcccaggctagaccaGTTACCCTCATC---ATCTTAGCTGATAGCCAGCCAGCCACCATAGGCAtgagtcaggccatagtgctggacccacagaattatgagctaaataagtagtgttgggttaagtcactaagttttaggcatagtgtgttatgtagcTCACAAACATATAAGACTGTGTGTTTTTTGACTGGAGGAAGAGATGCCATAAAGACCACCTTTTGAAACTTCTCAAATACTG
CCATTGA'..b'GATTGAGCAGTCCTGCATGCCCGCCCAC-GACTGGCAGGAATAAAGACCCAGAAGAGCTA------ACGTGC-AATGTA------TTTTCTAGTTCCAgggttggcaaactttctctct-aagggtgggatgataaacattttaggcttttcagaccaaga---ggcgacatcagag-ggtatgtaggt---------acaagagggaaaagttgcccccggaa-ttttttg--gataaaattcaaaa---ctttacttagggatgc---caaaatgtaaacttcatataatggtcatgt--ccataactattcttcttttatttt\n+>bosTau2\n+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------agacattg-ggtaaaattcaaatgcagactagctc----atgatgttaaagaattactcttgtgtggtaatggtcttgtgatagagatagaaatgcttcctta-tttttagataaacacttaagtattta---aggatgaaacgccctgatgtttgtaatttgctttagaatattttagccaaaa----------gaattaa-------------------------tgatgc--aaatatg--caaaaagagta--cgttaaacctaa-----------------------------------------------------atttgCGATTttcattta--------------------------aaaa-tcttcgtgcaacgcacggggctatcaatgt-gggatacagatgtgaacaa---------------------------------------------------------------------------------------------aacggacccgtgtgggactcggcggagcacacagattttgcgggagCACGTTCCCGTTAGGAAGTCTCTGATGCAATACGACCGGTGCCTTCAGGACCTG-TG--AGGCTGACTTTCCTTA-CCCCTCCACACCATCATCAAGGCAGGTGTGATTTTCCAGG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n+>dasNov1\n+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGACATTT-GGAGAAATTTG-----------Aatt--tcatgatgttaaggaattacttttgtatgatgatggtcttgtggctat-gtagaatttcttccgtg-ttttaagacgcatgctgaagcatgta---aggataaaatgtcgtggtgtttgtaatttattctaaaacattttagccaaaaacaaataaataaataaa-------------------------tgaagc--aaatatgggggaaatgtttaattgttaaatctagatttaacacggtatataccgtgcttcattatactagtctctacttttccatgtgtttgaaattttCATTAAAATGTTTGTTTGTTGTCTGTTTTAATGAAATCCTTTGTGCTAGCCACTGGGATGAAAGCTAGGGAACACAGCAGTGAGCAA-----------------------------------------------------------------------------------------------CAGCCTGGCTCCGT-CC--GGGGGCCGCTCAGCAGCTC-GGGAGCGTGGAGACG---GGAAGTCTGTCACGCGATGCG-----------CTGGGCCCG------------CTGTTCCCGCCCCCCTCC---CCCC----------------TTTCCCAAGTTTTAAA------AATTTACCTTCCCAGTGGCGGTGAATCCGGAGGAATACGGAAACTGGGGC-GCACTACCATGACACGTGTCAAA-AATCAGTTCCGTGGTCCGTGGAGGGCCTGGGGTTC------GAAAATCTTGTCC-CGAGCACCCCCGTGCGCCTGGCACCGCGACAGTGACAGGACTGAAGCGTG-GACGGCCAG-ACCTCTGCCCTCGGCTAAATCTCGCGGAGAAGGTGGAACA-GGTTACCCAAACCCGACCGAG-GAGGCGAGTTG---GAAACGGCGACTGGGACAAGCTCCCTCA---GAGACGGAGAGAGACCCCA-GTGGAAGGGGGGAGAGGCTCTTAGGGAAACGATGGGGGGACCCGCCCGCACCCGCACAGAGGCGCTGGCAGGCACAGCGGCCCCGAGGAGCCC------AGGAGC-AGGGC-TGTGT-TCCCCTGCATcaggggttggcaaactttttctgcaaagggccagatagtaaatattttaggctttgcaaaccaagaagtagaaagggaggcc-attatgtacgtatttatatagcaagagagaacattt---cccacaatttttttattgacagaatttaaaacttctttattgatgaacaccaaagaaacttgaatttcatataattttcacatatcacgaagtat
ttttttttt-----\n'
b
diff -r 000000000000 -r 2126e1b833a2 utils/maf_utilities.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/maf_utilities.py Mon May 19 12:33:30 2014 -0400
[
b'@@ -0,0 +1,601 @@\n+#!/usr/bin/env python\n+"""\n+Provides wrappers and utilities for working with MAF files and alignments.\n+"""\n+#Dan Blankenberg\n+import pkg_resources; pkg_resources.require( "bx-python" )\n+import bx.align.maf\n+import bx.intervals\n+import bx.interval_index_file\n+import sys, os, string, tempfile\n+import logging\n+from copy import deepcopy\n+\n+assert sys.version_info[:2] >= ( 2, 4 )\n+\n+log = logging.getLogger(__name__)\n+\n+\n+GAP_CHARS = [ \'-\' ]\n+SRC_SPLIT_CHAR = \'.\'\n+\n+def src_split( src ):\n+    fields = src.split( SRC_SPLIT_CHAR, 1 )\n+    spec = fields.pop( 0 )\n+    if fields:\n+        chrom = fields.pop( 0 )\n+    else:\n+        chrom = spec\n+    return spec, chrom\n+\n+def src_merge( spec, chrom, contig = None ):\n+    if None in [ spec, chrom ]:\n+        spec = chrom = spec or chrom\n+    return bx.align.maf.src_merge( spec, chrom, contig )\n+\n+def get_species_in_block( block ):\n+    species = []\n+    for c in block.components:\n+        spec, chrom = src_split( c.src )\n+        if spec not in species:\n+            species.append( spec )\n+    return species\n+\n+def tool_fail( msg = "Unknown Error" ):\n+    print >> sys.stderr, "Fatal Error: %s" % msg\n+    sys.exit()\n+\n+#an object corresponding to a reference layered alignment\n+class RegionAlignment( object ):\n+\n+    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )\n+    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed\n+\n+    def __init__( self, size, species = [] ):\n+        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." 
% ( size, self.MAX_SEQUENCE_SIZE )\n+        self.size = size\n+        self.sequences = {}\n+        if not isinstance( species, list ):\n+            species = [species]\n+        for spec in species:\n+            self.add_species( spec )\n+\n+    #add a species to the alignment\n+    def add_species( self, species ):\n+        #make temporary sequence files\n+        self.sequences[species] = tempfile.TemporaryFile()\n+        self.sequences[species].write( "-" * self.size )\n+\n+    #returns the names for species found in alignment, skipping names as requested\n+    def get_species_names( self, skip = [] ):\n+        if not isinstance( skip, list ): skip = [skip]\n+        names = self.sequences.keys()\n+        for name in skip:\n+            try: names.remove( name )\n+            except: pass\n+        return names\n+\n+    #returns the sequence for a species\n+    def get_sequence( self, species ):\n+        self.sequences[species].seek( 0 )\n+        return self.sequences[species].read()\n+\n+    #returns the reverse complement of the sequence for a species\n+    def get_sequence_reverse_complement( self, species ):\n+        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]\n+        complement.reverse()\n+        return "".join( complement )\n+\n+    #sets a position for a species\n+    def set_position( self, index, species, base ):\n+        if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )\n+        return self.set_range( index, species, base )\n+    #sets a range for a species\n+    def set_range( self, index, species, bases ):\n+        if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )\n+        if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." 
)\n+        if species not in self.sequences.keys(): self.add_species( species )\n+        self.sequences[species].seek( index )\n+        self.sequences[species].write( bases )\n+\n+    #Flush temp file of specified species, or all species\n+    def flush( self, species = None ):\n+        if species is None:\n+            species = self.sequences.keys()\n+        elif not isinstance( species, list ):\n+            species = [species]\n+        for spec in species:\n+            self.sequences[spec].flush()\n+\n+class GenomicRegionAlignment( R'..b'   for c in block.components:\n+        if c.src == src:\n+            yield c\n+\n+def get_components_by_src( block, src ):\n+    return [ value for value in iter_components_by_src( block, src ) ]\n+\n+def iter_components_by_src_start( block, src ):\n+    for c in block.components:\n+        if c.src.startswith( src ):\n+            yield c\n+\n+def get_components_by_src_start( block, src ):\n+    return [ value for value in iter_components_by_src_start( block, src ) ]\n+\n+def sort_block_components_by_block( block1, block2 ):\n+    #orders the components in block1 by the index of the component in block2\n+    #block1 must be a subset of block2\n+    #occurs in-place\n+    return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )\n+\n+def get_species_in_maf( maf_filename ):\n+    species = []\n+    for block in bx.align.maf.Reader( open( maf_filename ) ):\n+        for spec in get_species_in_block( block ):\n+            if spec not in species:\n+                species.append( spec )\n+    return species\n+\n+def parse_species_option( species ):\n+    if species:\n+        species = species.split( \',\' )\n+        if \'None\' not in species:\n+            return species\n+    return None #provided species was \'\', None, or had \'None\' in it\n+\n+def remove_temp_index_file( index_filename ):\n+    try: os.unlink( index_filename )\n+    except: pass\n+\n+#Below are 
methods to deal with FASTA files\n+\n+def get_fasta_header( component, attributes = {}, suffix = None ):\n+    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )\n+    for key, value in attributes.iteritems():\n+        header = "%s%s=%s|" % ( header, key, value )\n+    if suffix:\n+        header = "%s%s" % ( header, suffix )\n+    else:\n+        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )\n+    return header\n+\n+def get_attributes_from_fasta_header( header ):\n+    if not header: return {}\n+    attributes = {}\n+    header = header.lstrip( \'>\' )\n+    header = header.strip()\n+    fields = header.split( \'|\' )\n+    try:\n+        region = fields[0]\n+        region = region.split( \'(\', 1 )\n+        temp = region[0].split( \'.\', 1 )\n+        attributes[\'species\'] = temp[0]\n+        if len( temp ) == 2:\n+            attributes[\'chrom\'] = temp[1]\n+        else:\n+            attributes[\'chrom\'] = temp[0]\n+        region = region[1].split( \')\', 1 )\n+        attributes[\'strand\'] = region[0]\n+        region = region[1].lstrip( \':\' ).split( \'-\' )\n+        attributes[\'start\'] = int( region[0] )\n+        attributes[\'end\'] = int( region[1] )\n+    except:\n+        #fields 0 is not a region coordinate\n+        pass\n+    if len( fields ) > 2:\n+        for i in xrange( 1, len( fields ) - 1 ):\n+            prop = fields[i].split( \'=\', 1 )\n+            if len( prop ) == 2:\n+                attributes[ prop[0] ] = prop[1]\n+    if len( fields ) > 1:\n+        attributes[\'__suffix__\'] = fields[-1]\n+    return attributes\n+\n+def iter_fasta_alignment( filename ):\n+    class fastaComponent:\n+        def __init__( self, species, text = "" ):\n+            self.species = species\n+            self.text = text\n+        def extend( self, text ):\n+            self.text = self.text + text.replace( \'\\n\', \'\' ).replace( \'\\r\', 
\'\' ).strip()\n+    #yields a list of fastaComponents for a FASTA file\n+    f = open( filename, \'rb\' )\n+    components = []\n+    #cur_component = None\n+    while True:\n+        line = f.readline()\n+        if not line:\n+            if components:\n+                yield components\n+            return\n+        line = line.strip()\n+        if not line:\n+            if components:\n+                yield components\n+            components = []\n+        elif line.startswith( \'>\' ):\n+            attributes = get_attributes_from_fasta_header( line )\n+            components.append( fastaComponent( attributes[\'species\'] ) )\n+        elif components:\n+            components[-1].extend( line )\n+\n'
b
diff -r 000000000000 -r 2126e1b833a2 utils/odict.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/odict.py Mon May 19 12:33:30 2014 -0400
[
@@ -0,0 +1,137 @@
+"""
+Ordered dictionary implementation.
+"""
+
+from UserDict import UserDict
+
+class odict(UserDict):
+    """
+    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
+
+    This dictionary class extends UserDict to record the order in which items are
+    added. Calling keys(), values(), items(), etc. will return results in this
+    order.
+
+    Re-assigning an existing key keeps its original position. The order is
+    held in the list self._keys, alongside the mapping UserDict keeps in
+    self.data.
+    """
+    def __init__( self, dict = None ):
+        """Create an empty odict, or one filled from the mapping dict."""
+        #_keys must exist before UserDict.__init__ runs, because filling from
+        #dict goes through __setitem__
+        self._keys = []
+        UserDict.__init__( self, dict )
+
+    def __delitem__( self, key ):
+        """Remove key and its value; raises KeyError if key is absent."""
+        UserDict.__delitem__( self, key )
+        self._keys.remove( key )
+
+    def __setitem__( self, key, item ):
+        """Store item under key, appending key to the order if it is new."""
+        #test the underlying dict rather than scanning the key list
+        is_new = key not in self.data
+        UserDict.__setitem__( self, key, item )
+        if is_new:
+            self._keys.append( key )
+
+    def clear( self ):
+        """Remove every item."""
+        UserDict.clear( self )
+        self._keys = []
+
+    def copy(self):
+        """Return a new odict with the same items in the same order."""
+        new = odict()
+        new.update( self )
+        return new
+
+    def items( self ):
+        """Return ( key, value ) pairs in the recorded order."""
+        return zip( self._keys, self.values() )
+
+    def keys( self ):
+        """Return a copy of the key list, so callers may modify it freely."""
+        return self._keys[:]
+
+    def pop( self, key, *args ):
+        """
+        Remove key and return its value.
+
+        If key is absent, the optional default in args is returned, or KeyError
+        is raised when no default was given.
+        """
+        #UserDict.pop (Python 2) works on self.data directly, which would leave
+        #a stale entry in _keys, so the key list is updated here as well
+        had_key = key in self.data
+        value = self.data.pop( key, *args )
+        if had_key:
+            self._keys.remove( key )
+        return value
+
+    def popitem( self ):
+        """Remove and return the last ( key, value ) pair in the current order."""
+        try:
+            key = self._keys[-1]
+        except IndexError:
+            raise KeyError( 'dictionary is empty' )
+        val = self[ key ]
+        del self[ key ]
+        return ( key, val )
+
+    def setdefault( self, key, failobj=None ):
+        """Return the value for key, first storing failobj if key is absent."""
+        if key not in self._keys:
+            self._keys.append( key )
+        return UserDict.setdefault( self, key, failobj )
+
+    def update( self, dict = None, **kwargs ):
+        """
+        Add items from dict, then from kwargs, through __setitem__.
+
+        dict may be a mapping with an items() method or an iterable of
+        ( key, value ) pairs; both it and kwargs are optional.
+        """
+        if dict is not None:
+            if hasattr( dict, 'items' ):
+                pairs = dict.items()
+            else:
+                pairs = dict
+            for ( key, val ) in pairs:
+                self.__setitem__( key, val )
+        for ( key, val ) in kwargs.items():
+            self.__setitem__( key, val )
+
+    def values( self ):
+        """Return the values in the recorded key order."""
+        return map( self.get, self._keys )
+
+    def iterkeys( self ):
+        """Return an iterator over the keys in the recorded order."""
+        return iter( self._keys )
+
+    def itervalues( self ):
+        """Yield the values in the recorded key order."""
+        for key in self._keys:
+            yield self.get( key )
+
+    def iteritems( self ):
+        """Yield ( key, value ) pairs in the recorded key order."""
+        for key in self._keys:
+            yield key, self.get( key )
+
+    def __iter__( self ):
+        """Yield the keys in the recorded order."""
+        for key in self._keys:
+            yield key
+
+    def reverse( self ):
+        """Reverse the recorded key order in place."""
+        self._keys.reverse()
+
+    def insert( self, index, key, item ):
+        """Place a new key at position index; an existing key is left untouched."""
+        if key not in self._keys:
+            self._keys.insert( index, key )
+            UserDict.__setitem__( self, key, item )