Repository 'msp_blastparser_and_hits'
hg clone https://toolshed.g2.bx.psu.edu/repos/drosofff/msp_blastparser_and_hits

Changeset 1:1964514aabde (2015-09-14)
Previous changeset 0:69ea2a13947f (2015-06-21) Next changeset 2:bb0d4cd765c5 (2015-09-29)
Commit message:
planemo upload for repository https://bitbucket.org/drosofff/gedtools/ commit 1cc2b50091f512593c502176619998f5908fc8e8
modified:
BlastParser_and_hits.py
BlastParser_and_hits.xml
added:
test-data/al_sequences.fa
test-data/un_sequences.fa
b
diff -r 69ea2a13947f -r 1964514aabde BlastParser_and_hits.py
--- a/BlastParser_and_hits.py Sun Jun 21 14:31:29 2015 -0400
+++ b/BlastParser_and_hits.py Mon Sep 14 12:18:46 2015 -0400
[
@@ -14,6 +14,11 @@
     the_parser.add_argument('--tabularOutput', action="store", type=str, help="tabular output file of blast analysis")
     the_parser.add_argument('--flanking', action="store", type=int, help="number of flanking nucleotides added to the hit sequences") 
     the_parser.add_argument('--mode', action="store", choices=["verbose", "short"], type=str, help="reporting (verbose) or not reporting (short) oases contigs")
+    the_parser.add_argument('--filter_relativeCov', action="store", type=float, default=0, help="filter out relative coverages below the specified ratio (float number)")
+    the_parser.add_argument('--filter_maxScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number")
+    the_parser.add_argument('--filter_meanScore', action="store", type=float, default=0, help="filter out maximum BitScore below the specified float number")
+    the_parser.add_argument('--al_sequences', action="store", type=str, help="sequences that have been blast aligned")
+    the_parser.add_argument('--un_sequences', action="store", type=str, help="sequences that have not been blast aligned")
     args = the_parser.parse_args()
     if not all ( (args.sequences, args.blast, args.fastaOutput, args.tabularOutput) ):
         the_parser.error('argument(s) missing, call the -h option of the script')
@@ -122,12 +127,14 @@
         leftCoordinate = 1
     return getseq (fastadict, FastaHeader, leftCoordinate, rightCoordinate, polarity)
     
-def outputParsing (F, Fasta, results, Xblastdict, fastadict, mode="verbose"):
+def outputParsing (F, Fasta, results, Xblastdict, fastadict, filter_relativeCov=0, filter_maxScore=0, filter_meanScore=0, mode="verbose"):
     F= open(F, "w")
     Fasta=open(Fasta, "w")
     if mode == "verbose":
         print >>F, "# SeqId\t%Identity\tAlignLength\tStartSubject\tEndSubject\t%QueryHitCov\tE-value\tBitScore\n"
         for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore:
+                continue
             print >> F, "#\n# %s" % subject
             print >> F, "# Suject Length: %s" % (results[subject]["subjectLength"])
             print >> F, "# Total Subject Coverage: %s" % (results[subject]["TotalCoverage"])
@@ -149,6 +156,8 @@
     else:
         print >>F, "# subject\tsubject length\tTotal Subject Coverage\tRelative Subject Coverage\tMaximum Bit Score\tMean Bit Score"
         for subject in sorted (results, key=lambda x: results[x]["meanBitScores"], reverse=True):
+            if results[subject]["RelativeSubjectCoverage"]<filter_relativeCov or results[subject]["maxBitScores"]<filter_maxScore or results[subject]["meanBitScores"]<filter_meanScore:
+                continue
             line = []
             line.append(subject)
             line.append(results[subject]["subjectLength"])
@@ -164,14 +173,33 @@
     F.close()
     Fasta.close()
         
-    
+def sort_sequences (fastadict, blastdict, matched_sequences, unmatched_sequences):
+    '''Write the sequences that matched, and those that did not match, in the blast to two separate fasta files'''
+    blasted_transcripts = []
+    for subject in blastdict:
+        for transcript in blastdict[subject]:
+            blasted_transcripts.append(transcript)
+    blasted_transcripts = list( set( blasted_transcripts))
+    F_matched = open (matched_sequences, "w")
+    F_unmatched = open (unmatched_sequences, "w")
+    for transcript in fastadict:
+        if transcript in blasted_transcripts:
+            print >> F_matched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
+        else:
+            print >> F_unmatched, ">%s\n%s" % (transcript, insert_newlines(fastadict[transcript]) )
+    F_matched.close()
+    F_unmatched.close()
+    return
 
 def __main__ ():
     args = Parser()
     fastadict = getfasta (args.sequences)
     Xblastdict = getblast (args.blast)
+    sort_sequences (fastadict, Xblastdict, args.al_sequences, args.un_sequences)
     results = defaultdict(dict)
     for subject in Xblastdict:
         results[subject]["HitDic"], results[subject]["subjectLength"], results[subject]["TotalCoverage"], results[subject]["RelativeSubjectCoverage"], results[subject]["maxBitScores"], results[subject]["meanBitScores"]  = subjectCoverage(fastadict, Xblastdict, subject, args.flanking)
-    outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict, args.mode)
+    outputParsing (args.tabularOutput, args.fastaOutput, results, Xblastdict, fastadict,
+                  filter_relativeCov=args.filter_relativeCov, filter_maxScore=args.filter_maxScore,
+                  filter_meanScore=args.filter_meanScore, mode=args.mode)
 if __name__=="__main__": __main__()
b
diff -r 69ea2a13947f -r 1964514aabde BlastParser_and_hits.xml
--- a/BlastParser_and_hits.xml Sun Jun 21 14:31:29 2015 -0400
+++ b/BlastParser_and_hits.xml Mon Sep 14 12:18:46 2015 -0400
b
@@ -1,4 +1,4 @@
-<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.1.0">
+<tool id="BlastParser_and_hits" name="Parse blast output and compile hits" version="2.3.0">
 <description>for virus discovery</description>
 <requirements></requirements>
 <command interpreter="python">
@@ -9,6 +9,15 @@
  --fastaOutput $fastaOutput
  --flanking $flanking
  --mode $mode
+ ## Additional parameters.
+    #if $additional_filters.use_filters == "yes":
+        --filter_relativeCov $additional_filters.filter_relativeCov
+        --filter_maxScore $additional_filters.filter_maxScore
+        --filter_meanScore $additional_filters.filter_meanScore
+    #end if
+    --al_sequences $al_sequences
+    --un_sequences $un_sequences
+
 </command>
 <inputs>
  <param name="sequences" type="data" format="fasta"  label="fasta sequences that have been blasted" />
@@ -18,10 +27,25 @@
      <option value="verbose" default="true">verbose</option>
      <option value="short">do not report oases contigs</option>
  </param>
+    <conditional name="additional_filters">
+            <param name="use_filters" type="select" label="Use Additional Filters?">
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no">
+            </when>
+            <when value="yes">            
+                <param name="filter_relativeCov" type="float" value="0" max="1" label="Minimum Relative Subject Coverage" help=""/>
+                <param name="filter_maxScore" type="float" value="0" label="Minimum maximum BitScore" help=""/>
+                <param name="filter_meanScore" type="float" value="0" label="Minimum mean BitScore" help=""/>
+            </when>
+    </conditional>
 </inputs>
 <outputs>
  <data name="tabularOutput" format="tabular" label="blast analysis, by subjects"/>
  <data name="fastaOutput" format="fasta" label="hits"/>
+ <data name="al_sequences" format="fasta" label="Blast aligned sequences"/>
+ <data name="un_sequences" format="fasta" label="Blast unaligned sequences"/>
 </outputs>
 
   <tests>
@@ -29,9 +53,12 @@
         <param ftype="fasta" name="sequences" value="input.fa" />
         <param ftype="tabular" name="blast" value="blast.tab" />
         <param name="flanking" value="5" />
+        <param name="use_filters" value="no" />
         <param name="mode" value="verbose" />
         <output name="tabularOutput" ftype="tabular" file="output.tab" />
         <output name="fastaOutput" ftype="fasta" file="output.fa" />
+        <output name="al_sequences" ftype="fasta" file="al_sequences.fa" />
+        <output name="un_sequences" ftype="fasta" file="un_sequences.fa" />
     </test>
   </tests>
 
@@ -39,7 +66,7 @@
 
 **What it does**
 
-Parse blast outputs for viruses genome assembly. Outputs analysis and hit sequences for further assembly
+Parses blast outputs for virus genome assembly. Outputs the analysis and the hit sequences for further assembly. Also outputs the contig sequences that have, or have not, been blast aligned with the indicated cut-off.
 
 </help>
 </tool>
b
diff -r 69ea2a13947f -r 1964514aabde test-data/al_sequences.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/al_sequences.fa Mon Sep 14 12:18:46 2015 -0400
b
b'@@ -0,0 +1,1228 @@\n+>Locus_81_Transcript_1/1_Confidence_1.000_Length_155\n+GAGGTCCCTGCGCACGTTGTTCGGTGCGTCCTGATAGCTCATTCCCATTGATTAGCGCGA\n+GTTTGCTTGGCATCGTTAAGCTTAGGTAACGATTCATCCACGGAGCCCGGCATAACGCCG\n+TCTTAATCTTCTCCCAGCATGATCCGTGTAACTCG\n+>Locus_10_Transcript_6/8_Confidence_0.333_Length_1121\n+CGTTATCCAGCTGGATTATGTTGTTATTGTGTAGAAAATATAGGAGTCTCGAAGACTCTT\n+TAGAAAGCATACATGTAGTTGCCTCCCTCGTTCTGGATGTACTGAACGGCCTGCTTGCAT\n+ACTGATGTTAAGTGCAGAGCTATGTTTTCTCTTCCTCGCCAAAAAGAAAAACCACTTTTG\n+ACGACTATCTTACTTATCCAGGGTGTTAGTATTGGATTCAATATCCCTGTTGGTTCAAGA\n+GGGACTTTACCTGTTGCAATGTTAATTGCATCAGTCCTATGCCATCCTTTCCTTTCATAT\n+TCCGTCATAGTTTTGTAAGCTTCAGGATACCATTTGGCCAAACATTCTTTTAGACTAGGT\n+ATCTCAATCTTAGCTTTCTTCGCGATTTGATGTTGTCTTATAAATTCGCTGAATATTGGC\n+ATTCCGTTTGGACCAGCCTCATGAGATTGAATCATACGTCTAGGCCAAATGTTATTTTCG\n+CACGTTGGTCCTTCTGTTTTGTAAAACCTGTATGTGGGTACAACCGTCTTCTCCCATACC\n+GGTTTCAACTTCCTTAAATTTTTGATATAATTGCTGGAAATGTGTTTTTGCGGTCCAGGA\n+ATGTCGTCGGCTGCTATTTTCATTGAAAATTCGACTTGTTGATAAGCCGTCTTTTCCTCA\n+TCTGTCATTTGCTTCCACGTCAAGTTGATGTTTTCACTAAACAGTCCAGGAACATCGAAC\n+GTTGGGATGTCAGCAAGTGGTAGCTTGCTAGTAGTTTTCCACCCTTTCCATTCGTATATT\n+CCGAAACCACCCAGTCGTTTAGGCAGGTGCAGCCAGTGATAGCTCTGTCCCGTGTATTTA\n+CTCCACTTAATTTTGTTTGCTTGATGTAACCAATCCAATTGTTTCTTGCTTCTCCTTTCT\n+AATAAATAGATATTATTAGCGGTTGTCTCAACTTGTTGATTTGCAGTCCATGGTTGTGGG\n+TTCCACGGTTTCCTTTGTGTCACGCTAGGGATAGCCCGATTTGTCCAGCCTCTGACGCCT\n+TGAGTAGATATTTCTGTCCTTAAGAATTCGCAACAATTTTGCATAATTCCAAACTTGCTG\n+TTTTCTCCAACGGCGTTGATTGCCTGATAGCTATACCTAAA\n+>Locus_20_Transcript_1/1_Confidence_0.000_Length_191\n+CCCCCCCCACCTTCGCAATCACAGACAAATTAAACGTAAAACAAACGACATGGATGCATC\n+CAACCCTATTATTTCAAGCGATAGGAGCACTGTGAATCAGGTTTATGCGAAGATGGTGAG\n+GAGAAAACGGCAAGATCTTGCCTATCTCCAGGAGTTGGTGAGGACAGCCCATCGAGAGAT\n+TCTCCCCCAAG\n+>Locus_1_Transcript_5/7_Confidence_0.231_Length_163\n+AATAATGAGGCGGAATCGCCGTCGTGGACGTGGCAAGGCAAAGCTAACGGTAAGCGAAGG\n+AAGGGCCCCCGAGGAGGCAAAATCAGCACTCGAAGAACGCCTCCGAAAGCTGGAGCTCAG\n+CCACAGCCTTCCAACAACCGGAAGTGACCCCCCACCCGCTAAA\n+>Locus_15_Transcript_1/1_Confidence_0.00
0_Length_436\n+AGCCAGAGCATCCTGTCTAGCACCCCTCTGCGTCCCTGGTGCATCTCGGTGTACATTCTT\n+AACTTACCTATTTAATCACCCATGCGAGAACGATACTACGAAGATACTGAACTCAATAAC\n+CTCAGCTACAGTGAACAAGATAAACAGCTAAAACAAATATGTCAAACGAGTACCTTAGAT\n+CAATCCTCATGCCTGAGAGAGGTCCCTCCAGTATACCAGACGACAACGTCCGCCGTCATT\n+GCGTACGACAAGAAACAATCACTGCGAACATTGTGGTCGGATCTTCTGGAAAAGGGGCCT\n+TTGTTCTGTTCCCTAACAATCCTAGCAGCCTTATTGGCGCTCATTTTAAGTATGATGACC\n+AGGGCAAATCTTACAAATACACCCAATCACTCGTTGTCGCCCAACGCCTCAATGAGTCCT\n+ATAATTACGGAAGAAA\n+>Locus_58_Transcript_1/2_Confidence_0.333_Length_476\n+CCGAGTCTCCTCGGAGATTGCCTTTGCAGCGTTTCTTGGTATCGGTTGGTCCAACTCTTC\n+CAGGAATGGGTTCGTGTTCATGAATGGGTTCATGCTCGCGGACCCGGCTGTCCAGTGCGG\n+GAGGGTCTGTTTTGGGGTTCTTCTCTTTACGTCTCCCATATCTTTTATCTTTTTGTCCAG\n+AGATGTCAGGGTCTTTACTTGAACTAATCCTGGTGAGTTCCCAAGTAGTGGAATCCCAAG\n+AGAGTCCGTCAATTGGAGTTTCAGGTTGATGCCAAAAACTTCTCCCACACTGGACCCGAC\n+AACGGATCCTGTGAAAGCTGCAAATGGTGCCTTCCCCAAAACTCCACTCTTTAGTAAGTC\n+GTCTGCTGCATAAATGGCAAGCTGGTGGGACGTTCCCGTTACCTTGTTTGGTGATGTCGA\n+GGATGCGAGTCTTCCTTCTAATAATTGATAGCATGGTAGGAGGATGTAATCGCTTC\n+>Locus_48_Transcript_1/1_Confidence_0.000_Length_372\n+ATTGACGCGGCTCACCTCCGATTATCTCGAAAACCGCCCTAAGTTGTAGTTGTTGGGACG\n+GCTTGTTCAGACGCTCAATCGTTTCCTCGATTGACAATGGGTCAAGGTTGTTGAAAGGCC\n+CGTTCATCAACCGTACGAAAGTGTCGGCGATCCTCGCGATACGGTCGCTTGGTTTCTTGT\n+CGTTGGCGACAAAGGTTACCCTGCGTTCAATTGATTCAGACATTGTCTCCCAGCGTTTAA\n+TCATTGGCATCATCATACAGTCACTCACGATAGGCAAGGTGTATTGGCGTGCGCTCACTT\n+CTGGTACATCTGCGTCACTGGTTACTGGCCAATGGACGCGTGGCATTGTAGGTTTATACA\n+CAGTTGGACTAA\n+>Locus_4_Transcript_4/5_Confidence_0.333_Length_1170\n+GTCTTTTTCTTTCTCATTTGGTTCATGCAATTTTTCACCTGCGCAGGTATTTTTCTGTCT\n+TGTTGTTGATCTTGTTGTTGCCTTTGGTATCTGTCTATTTATTGCTTTAAGTTCTCCGAT\n+TGTTATTCGGGACAATGCGTCTGCGACATGATTATCCCTCCCCTTGAGATATTCTACTGT\n+GAATTCAAACTCCTCCAAGTCTAGTCTCATTCTGGTTAATTTTGAACTGGGGTTTCTCAT\n+TGAAAAGAGATGTGAACGCGATTAAGACTCATGGGAATGGTTTTGGCATCATGAAACCTG\n+ATAGTAGTTGGGAAATTGCTCCACCTCAACCCAAGGAAAAATACCTCAGATATTACGCAA\n+ATGGTGAATTCGTTGATATGAAAAACCTCGTTAACGAGAAACACCCCGTCATCGTTAACG\n+ATTATTGTGAATTTGCCCTGGAACAT
GAAATGTATCGTATTCTCCAACCTATGGACCCTT\n+CCAATTTTGCACCTCCACGGGAAACGGGAACAA'..b'AGAAAATTTAAGACAGAAGTACTCGATGAGCTTTGGGGAAGTGGTTGGC\n+AGGAACGAAATAAGATGAACCAGTATGAATGGTTGTCTTACTGCTGGGCAAATAATGTCA\n+CTAAGGTCGATACTCAAACCGTGCTTCTATCTTATGACATCAAATGGCAACAACTACCTG\n+CTGATATGAAAATGGCTATTCTCGGCGATTCGCGAGCTGATCTTGAAGCTCAAAAAACTC\n+ACAATAAAGTGATGCATGCATACAACGGTAACCCTTTGTGTCAGGGATTTCAAGAAGTTG\n+AAGCTTCAAAAACCTTCCTCAACATCGCGGAAGAGAGTAATTCAGTTCTGAAACCATATA\n+CTGGACTGGAAGCTGAGAAATACATCACCAACATTGTAGGAGACATGAATCCGAATCAAT\n+CAAGGATCTTCGATCAGGACAGGCTTAGAGGTAACCAATACAATGCCAATGGGGCTGTGG\n+TTCATAATGCTGTATCAACTATTCCGTTTACAAACCTCATTCCTAGGACGATTCGATCTG\n+ATGATGACGTCCTTGAGAAGTCGGCCAACAGATTACAGGTTACAGAGACAAACGTTACGG\n+ATTACTACGTTAATCCGATTGAGCCAACTGAATTATCCAAAACAATAAGTGACCAGATCA\n+AAAACAATCAATCATCTAACTGGCGACGAGATAACACGTCATTGGCTGGTTTCAATAGTT\n+TCGACATTGCGACAGTCAACACTGCACTAATTGCAAGAGGTCTAAGCACTGAATCAATGA\n+CTCTCAAGTTAGAGCTATTGCACGGAATAATGGCTATGCAGGTTGAAGCACCAATGATCA\n+ATTCCAGCACTTATTCGATCGTAGATAATCATACAATCCCGACCGTAACTGACAGGGCCG\n+TCATAGGCATCAATGACTCGCCTGTGTTTGGCGAGGACTGTGGTGGTGATCTTCCTGAAT\n+ATCCTTTCGGCGGCGGAACCGGTACAATTGCCTTTCACCTAACATTGCAAACTGTTCCTG\n+AAGAGAGGAGAGATAAGGCAATCTTCTGCCCTCCTGGTTTGTTGCAAGCAGCCCGAGATG\n+GAGCAGAGGCATTGGCCCTATTTGTTTTGTCGATGTCTGAATGGCCTTTCGGTATTTATA\n+CTGTCACCAAGAGAACAACCGATGAGAAGGGACTGAATCCTGCGGATCAGGTTTACGTGC\n+CGATGGAGACCATAACCCGTGTAGGTGGAGATAGAGTATTGGACGTTGTACTTCCTCGAA\n+GGTATGCGGTTGCAAATCCAACGACTCAAGGAAATGCTAATGCTCTAGCAGTTATACAGC\n+CTCAAGCTGGGCCTTTAGATAACGGTGCGGACGGATTAGCCGCTGGTGAATTATTGGATG\n+TCAATTTCATCGGCGCCGACGGCATTACTGAATATCCATTGACATATTACTTGTATACCT\n+GGGCACTTCGATTTGATATAACGACGATTAGGCAATACATTGGTAGAATGGCAGCGTTAA\n+TTGGAGTGAAACACCAACTATGGGCTAGTCATGAAATCAGAGTAGCTTTGTGTCAAGTTG\n+CACCCAAAATGGTGGTCGGAGTTACAGGTTCGGGAGACCTGCCAAGAGGATCAGCCGCTG\n+CAAGCGAGGTATGTTACTCAAGCTTATTGGAGGTCTCACGCTCTGAAGAAGATTTTCCGC\n+TGCTTGGTCAGGTTCAAGCCGATTTCAGAGTCTTTGAAACCAATACAAGCACATGGAATA\n+AAGTAGTTTTGGGATTGGCAACAGCACCAAACGTGACAAGTGAACAAAACATGCATGTAC\n+CATTCGTTGTTGGCGATCCGAGATCTAACGCGTGGGACCGACTCGA
AGCAGTACCAATTG\n+CTGCTGCTTGGCAGATGTACTACCATTCAAGGGGCGTAACTACTGCCGCTTGGAATGATG\n+CGTACACTAACGTAAATAACGTTTGGTTGCAAAAGATGGCCCGTGATAGCTTCTCAACGA\n+CCCAAAGCACTGGGACGATACTGCCTGCCAGATATGGTAAGATAGTCAAAAACCTGATGA\n+GAAACATGTTTGAAAGAGAACCTGCCAAAGTAGTAACAAGCGTGGGAGGCGATGAATATG\n+AGATAACCCATTTTGAGCGCTGGTTACCGGGTAATAGATATGCTTCCGTGTTTGAACAAG\n+ATGAAACTGAAGTTAATCTGTTTCCTCCAACTTTATTACCGGATATTTGGGTTCAATATC\n+CAGCGACTCACACCCCAATCATGTGTGCTTCGTTCCCACCCGTTTTCGGCCAAGACTCAA\n+CACAAGGGTTCGGCAAAGAATCACAACTTATACCCTTTCGAAACGCAAACAACAATCTTG\n+TAGCACCATACGTTGAAGCTTTCGTTGCCAATCAAGCTTATTTCCCGATAGGATCGGGCC\n+CAAACATCAACGACAAGGTTCTGTGGAATAGTAGATTGTGGATGACAAGTGGTTTCGTTC\n+AGTACTTGGATTACGCTGGCAACGCAATCAACGAAGTGGTTCCTGCAGCAGGGCTACCTT\n+TGGGTAGGTCAATCCCATTACTGCCAGGGGAAGTTCAGCCGGTTGGTAACACCAACATGA\n+GCACAAGCTGCGTTCCTCGTTATTCTGTGGACGGTCGCCGAATTTTTACTTATGTCAACA\n+CAGCTCAATCCGTTCCTTTGATACAAGCGTGTAATAGAGCCAATAGATTGGCCAGATCGG\n+CATGGCTATTATTGCATGTCTACATCGAACCAGAGTTGCAGCTATTGAGCGATGAAGTGG\n+TGGACATATTCGACCAACTGACAAGCAAGACTTTTTTAGATGTAGCAAAATCGGCTGCGG\n+ACAGTGCGGAGGGCAACATTCCGGCAACGAAGGTATTGACAGACCTCCAGGCAGTGGATT\n+CAGCAACGCTGCCGAGTACTTTGGATCCATCCACAAATATGCTCCAACCAGCTCCTTTAC\n+TCGGCGAACCTACGACAAATTAACTCGATACATTCATGATGGGGTAAGTCGAAGTAGTCA\n+GAGAGAAATTTCATCATTGTCGGATGATTTAGTTAGATTTAGTACTCTCACAGATTTTAC\n+TGTCATCGATTTTATTACTAAGAAATTAGATTTAGAACGTCCGATTAGTACTCAAGGAGA\n+TTTATTGGCGTTAGAACCTAGATGCAAGGGTGACCTCGCTGTAGCAAGACTTAAGATTAA\n+GGACATAATAGGTAGGATTGATAAGGATATTAGAACGTGGTGTGAAGCAAATTTATGCCA\n+TTTGGATGCAATCTTGGTAACGAACTTGATAATCTGGGGACAAATCTGGGGGTTGGAAAT\n+TCTGAAAGCTTTACATTCAACAGGAATATTAAACGATTTCGATACTTTCGCAACAAAAGG\n+AAGCAAGATTAGTGCATTTGTCAAGCGTTTTCCCTTCGACAAGGATGATGCCAAAGCTAG\n+ATGGGCAGAGATCAACACGCTGACGGGCTATTTGCAGAATGATTTTGGAAACTTCGATTA\n+CGATAAGGAATTTGAAGCTTTAGCTACCGGAGATAGTAACCACCCCGCTTGGTGGGAAGA\n+AGTATTCACTAAGAAAATAAAAGAGTTGATGACACATCAAGAACACAAAAAGTATATCAG\n+TTTTGAAAAATATGTTAAGGAAGGATATTGGATAACATCTGGCAGCAGCAGCATCGGCAA\n+AGTCAATTGGTCATATGACGGAGACTTGGGCAAATTTAAGGCTAGAAAGAACATGTTATT\n+AGATCTATATACGCCGGACGAAATCTACAA
AATGGCTGTAGAATGGGACGGGAAACTAGA\n+GAATAGAGTGTTCATCAAAGATGAATTGGCAAAGAGAAGATTGGCAGTGGCAAGCAATAT\n+TGAAGCATATTTGAATCAAGGGTATATATTTTATCTATTTG\n'
b
diff -r 69ea2a13947f -r 1964514aabde test-data/un_sequences.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/un_sequences.fa Mon Sep 14 12:18:46 2015 -0400
b
b'@@ -0,0 +1,484 @@\n+>Locus_8_Transcript_1/4_Confidence_0.400_Length_658\n+TTTGCTTTTTTCTTTTATTCTCTGCCTTCCATCTTCACTATGACCATCTCATCATTAAGG\n+CAAATCCGAAGAGGCTCTAAGACAATCGGATCGATTTATGCAAATGATACTCCAATCTTC\n+GAGCGTTTCTTCCTGTGCCTCTCCGCCACCACGAAGAATAGCAACTCGTCATTATGGCAA\n+ACAAAGATCGAAGACCAGTTTTCACCTCAATCTAATGATGTTTGCCTCATTTATCCTGAT\n+CCGCTGTGATCCGCTTGTAATCAATCCCCATCCATCCTCAGCCAGGGGCAAGTTCAAGTT\n+GCCAGCTCTCCGCAGCGATCTCCTCCACGGTTCCTTTATGATCTCGAGTTTCATCCTTTC\n+CACTCCACCTGGCTTGAGTGGGTGGCTCAACATAATGGGGGGATCTGCCGTTATTCGGTT\n+GTATCCCGATCGCAGCCGCAGCTGCAAATAAAATAAGAGTCGACTTCAACACTCGACTTG\n+GCTTAATGCAAAGTGCCGCTGAGGAGAGATGCGTGGGTGGGCCCACATTGGGAAGCCAAC\n+TGGGTCTGCTCTAAGTCGCAGAGTGCAGATAAACACTGCCAAAAAATATGTGACTAAAAT\n+CGAATTTATTTCCTGGGTCGTCGATTAGGAAATAAGAATGAGAGGTAAATGCGCAGTC\n+>Locus_32_Transcript_4/5_Confidence_0.286_Length_242\n+GGTGGTGCAATTTGGAGTCTGGCCGCGCGATTAGCTAAGCGCTGCGACCATATAATCGCA\n+TTGACAACTCCGATTTGTGTGCGGGAGTCGAGGAAGTGCTCCACTTTATGGCAGTCACAA\n+TTTCATGTTTGAATGTTTCGCCTCGATAGGATTCGGAATGGGGATTTTCGAATTGCGATT\n+TGGTCCGCTGGCCTGGCCTGGACTGGCTGGCCGCTTGTCCGATTCTATCGAATGCGTGTC\n+TC\n+>Locus_5_Transcript_3/6_Confidence_0.222_Length_630\n+CGCCTGCATTAAAGCACTCCCACAGATTTACGGTCTCACCTTCCCCCTTCAAAAGATTGC\n+AGGCAACTTCTGCCAAGTGTTCAGCTCCTTTTTAGAGCATCGCAATGGATCTTGGGGCAG\n+CTGAGACTTTTCCGGTATTCCACTCGACAGCAACTTTCATGTTCTCGGCAGATGCGCGGC\n+AAATGATAACATAAAATGCGTTTTCCATTTCGAGGCATAATTTCGTGGCGCGGGCAAAAC\n+CATTTTCCGCATTTTCTCGGCCAATTTTCGTTGCCTCTGCGTGCTGTTCACAACTTGGCG\n+GTGAAAGAGGCACTATGAAAGAATCAAAAGCAGTCAACTCCTGTTCAGCAGCAAACAAAC\n+AGCAAAGAGGCGGGAAAAACAATATTGTATTTTGCGGAAGGGGAAAGGTACGAGGTCCCC\n+CCCGGGCTCCAGAAGTCTGCTGGAGCAATAAAAATAATAATCGAACTGCAGCCGCATTGC\n+GTTGGCCGCAAGGTCATTGCCCATTGGCTCTGATCGATTGGGCCAACAAGGTGGGCAAGA\n+GGGGCGATTCGAGCCAAGAGAAATGAGAGAGGGCAGAAACTGCCGTTGGTTAGCGATGGT\n+AATTTGATAATTGGCAAAACTGTCTGCTCG\n+>Locus_3_Transcript_3/5_Confidence_0.375_Length_130\n+TCCCATCGGCGGGGGATGGGAGGAAGGATGCCGGTGATGCGCCTTACGACACCTTAAGGT\n+GTCATAGACACGGTGAGGTGGCAATTAGGGCGCACCGTGGGGCACCCTTCACCGTCAACC\n+CTTCTTATCC\n+>Locus_8_Tr
anscript_4/4_Confidence_0.200_Length_135\n+GCAACGAATATCTTTAATTACGCTCTATAATCAAATATAGAGAGGGTCTTAAATATTCCC\n+AGAAGAGGCCATTTCAAGTCACTAACAATTTGACTTTGCTTTTTTCTTTTATTCTCTGCC\n+CTCCATCTTCACCAT\n+>Locus_32_Transcript_2/5_Confidence_0.286_Length_124\n+CGCAGGAAAGGTGAAGGTGGTGCAATTTGGAGTCTGGCCGCGCGATTAGCTAAGCGCTGC\n+GACCATATAATCGCATTGACAACTCCGATTTGTGTGCGGGAGTCGAGGAAGTGCTCCACT\n+TTAT\n+>Locus_39_Transcript_1/2_Confidence_0.667_Length_187\n+ATTAAAACAGAGGAGACAGATGGCCCTTGCGAGCGTTGACTGTCTCTGATTTCTGCCCAG\n+TGCTCTGAATGTCAAAGTGAAGAAATTCAAGTAAGCGCGGGTCAACGGCGGGAGTAACTA\n+TGACTCTCTTAAGGTAGCCAAATGCCTCTTCATCTTGGAGACCAGCTGCGGATATTGGTA\n+CGGCCTG\n+>Locus_11_Transcript_2/2_Confidence_0.333_Length_1110\n+GCTTATCTTTCATATGGTCCATAGTAAATACATTAAAAACTAACGACCTTGGGACACGGC\n+CGCTGACACTGCGAAGGATGTCAAAAAAAAGAACCTGATTTTTAATTTCGACTGGACGAC\n+AGTTGAAATAGTGGGAGCGGTTTCCATCCCAGTATTCAAAACGGCGGTTTGTGATGATGT\n+TTACATAATCCTCAAATTTGTGCTCGTATCCTGCTTGTGTGTCGTTCTCAAACCAAAAGC\n+GTATAAAGACAGTGCCGGAACGCTTGAATTTCTTGAAATAACATTCCTGTTCTGAAAGGT\n+ATCCGTCGGTTTCGACAAGCACTACAGGATCAAAAATCATACACCCATATGCGATATTTG\n+CTCGTGAAATATGCATTATTTTTCCAATGTCTGGTGTAGAAATGTCATAGATAGAATGCA\n+AGAACATCAATCGTTCTGAAGTTACTGTACAATATTGAGCCGGTTTACGGCAAATGACTT\n+TCGGGTCTTCATTGAGATGCAGTTTTAGAACTTCCGAATGGTGTTTGGTGAGACCCTCTT\n+GTTTGTATGATCGAAGTTCAAATAAGTAACGACTGTGACGGTAATCGTCGTTAGCATCGA\n+GTAAGGGGTAACATGTATGGGTATATTTCTCTGCTGCATTCAGATGGGTCAAAGGGTTAC\n+CCCCAATATCTTTTAGTACACAATCAAATCCGTTGATTGGTACAGTGCCTCGTTCAATCC\n+TCATTCGATACTTCATGTAACGTTCGCTCAATAAGCGATGTGCACGTGCGAATGCATGAG\n+GTGCACTATCCGTAGCTTTCGAATAATCTAGGTTGAACATGCTGAATGATTTATTCAGTT\n+TGGATTCTTGATTCGATGTGAGATGTTGCTTGATGTATACTTTTTGGAAGTTGCGCTTCG\n+CAAGTATTTCTTGCTTAGCTTTGCAAATTCTTTTGTGTGCATAGGCTTCTTGGAACAATG\n+CATCACTGCCCAGACTCTTTGCAAGTTGAGCTTGTGTGACAGCATGTGTTGATGGAAATT\n+CAGAAGCGACGTTATCAAGAACAAGGTCAATTGCGGCACTATTTTCAATCGAATCAAAAG\n+TGACGTCTGAACGATCAACAGACGTCAAAA\n+>Locus_25_Transcript_2/3_Confidence_0.200_Length_363\n+ATTACTTGAATTTTTCAACACTTCAATACAAGATTCCTGACAACCACATTGTACCTTCTC\n+CACTAAGTGGTTGCAATCCTATGGTAATCG
ACGATCAACTCTTTAACAAGCACATGAACT\n+GATGGACTAACTAATGGGCAAGCAAAGTGTTTTAGGCAT'..b'GGCATTTTGTTTTGCATTATCCAAGATTCTGTGC\n+TGGCGCAAACAACAAGCAACTCATCTGCCACTTTTCACACCCTTGAAGCGGAGTTTTCAC\n+TTCTCTTTGGGTGAGCTTTAGCCGCAAATGGAGGAGCCGCCTCATAAATCATCGCCCAGG\n+CCACCGGTGGGCGTCGTCCAACGTCCAATGCTAGAAGTGTGCGCAGGCGCAGCTGCTGCT\n+CACCCCCTCCTCCCCTTAATTTCCCCAACTCATTGTCAGCTGCAAAGGTGCCAAAGAAGT\n+GTACAAACTTCGCGACTGATTTGGGGGGATTTGGCCAGCCGTA\n+>Locus_6_Transcript_1/3_Confidence_0.400_Length_598\n+AAATGCGCAGTCTGCTCCAATATCTTATGGTTTGAAACAAAACAAAAGCAGTATACCATC\n+GACTATATATTAAGTAGATAAAACTATTTTCCAGTGAGAGATTGTGCTTCATTAACACTC\n+GCCGCTCGTTCGCCTGAGTTTCCTTTTGTTTGCCGCAGTTTTTGACACCAAACTTGGCGT\n+CACTTCAAAGCCCTGGCGATTAATCTTGAGTGCTGGTGGGGAAGGAGTGGGGCTTCTCCA\n+GTGCCAGATCTTCCAGATCCGGTCGTGTCTGCAATTTGCAGGCATTTTGTTTTGCATTAT\n+CCAAGATTCTGTGCTGGCGCAAACAACAAGCAACTCATCTGCCACTTTTCACACCCTTGA\n+AGCGGAGTTTTCACTTCTCTTTGGGTGAGCTTTAGCCGCAAATGGAGGAGCCGCCTCATA\n+AATCATCGCCCAGGCCACCGGTGGGCGTCGTCCAACGTCCAATGCTAGAAGTGTGCGCAG\n+GCGCAGCTGCTGCTCACCCCCTCCTCCCCTTAATTTCCCCAACTCATTGTCAGCTGCAAA\n+GGTGCCAAAGAAGTGTACAAACTTCGCGACTGATTTGGGGGGATTTGGCCAGCCGTAC\n+>Locus_22_Transcript_1/1_Confidence_0.000_Length_312\n+ATCGAAAAATTTGTCAAATCTGCTAAGAAAAAACTAACAGATGAATGTTTCCCAGTCGAA\n+GCTTGCAACGAACATGAACCTGAATTCGATGAATCAGATTTAGGTACAGGAATAACCTAT\n+TCACCCATTTATGCAGTCGTCAAAGTACAAAAATGTGAACTTCCTGCAACTCCCGTGCCG\n+TTTGATGAACCTGTCGAGAAGGATAAACCAGATACAGAAAGGATAGAGGTTGGTGATATA\n+CGCAAATCTATGGATGAATTCACACGCTACCTTAAATTCACACATGATTCAGAAATTAAC\n+AATATGAAATCA\n+>Locus_37_Transcript_2/3_Confidence_0.400_Length_145\n+TTATTTAGGTCACAAGTGTACTGACAAAGGTATATTGCCAGATGACTCCAAATATGAGGT\n+AATAAAGAACTGCCCCAAACCAGTAAACGCAGACGAAGCTAGACGCTTCGTGGCATTTTG\n+CAATTATTACAGAGGATTTATTAAG\n+>Locus_3_Transcript_1/5_Confidence_0.125_Length_388\n+CCACATAATACAAATAAATTTCAGGCATCGGAAAAATATATAAGGAGATTGCAGCCAGCA\n+CCTTGGGGAACCAGCAAAGATCTCAGCTCGTTATGGCCATCGACATTGTCCGGCATCTGA\n+CAAATTTTTGGATTTTGCTTCACGGCGATCATCTCTCAATCTCAATTTTGGACCCAATCT\n+CGATCGCCATCTTCCCCGGCTGCCAATCCATTTGGTCTGTGGCCAGTAGCTGTGGGGCTT\n+GATTGC
GTCGATCGGCCTGTGTGGAATTGGAATTGGAAGTGGAGCTCTAGTTAACAGTTG\n+AGACCTGGACACCGAATGCGTGTCTCCGCCCCCGGGAACTGCAGAAGCAACAACTGCAGC\n+CGCAAACAATTGACAGAGACAAGCGCCT\n+>Locus_13_Transcript_2/3_Confidence_0.600_Length_223\n+CTTCTGAGAAATCAAAGCCGGTGGTCTCGAAAGGCGGAAGAGTGAGGATTTTGGTGGGCG\n+ATAGGTGCCAAGTGTTAAGGGTTGTCGTGGGTTGATAGTCGTTAAACGTCAGCGAGGGTC\n+AGTGGCTTACCAGGGTCTAAGGATATTTGAGATGTGGACTTTTGCTTAGAGAGGAAAATG\n+CAAGTGAAAGAGTCAAGCTATAATTGTAAATGGTGAACTACAA\n+>Locus_23_Transcript_1/1_Confidence_0.000_Length_197\n+AATTGAGATTAAATGACAAAACTCCGGTCTATATCAAAAACTATAGAATGCCAGAAAGTC\n+AAAAACCAGAAATTCAAAGGCAAGTTGACAAATTAATAAAAGATGGCATCGTCGAACCAT\n+CTATTTCAGAATATAATAGCCCTCTTCTCTTGGTACCCAAGAAATCACTGCCTAACTCGG\n+AGGAAAAGAGATGGCGA\n+>Locus_41_Transcript_1/2_Confidence_0.667_Length_155\n+CAACTGGACTCCAATTTCGACTCGGATGCGGCCCAGGTGGCCAGCTGCAGTTGCTGGGCG\n+GCAATAAAACATTTACCACCGAATTAGCCCAGTCGGAGAGTAGTTCAAGTAGTTTAAGTG\n+CAAGACCACTTAAAATTCAGTTACGACTGCTGCCC\n+>Locus_71_Transcript_1/2_Confidence_0.333_Length_170\n+GATTAATGAAAACATCTTTGGCAAATGCTTTCGCAGTCGGACGTCTCGCTACGGTCCAAG\n+AATTTCACCTCTCGCGTCGTAATACTAATGCCCCCAAACTGCTTCTATTAATCATTACCT\n+CTTGATCTGAAAACCAATGAAAGCAGAACAGAGGTCTTATTTCATTATCC\n+>Locus_64_Transcript_1/1_Confidence_0.000_Length_172\n+AGGGATGTTGTTTCCGTAAAGCGCCACGGTTCCTGTGGTGTCTCGTGCGCTCTATTCGGC\n+CCTTGAAAAACCGAGGGAGGCTATTTGAATTTCGTGCCAGGCCGTACCGATATCCGCAGC\n+AGGTCTCCAAGGTGAACAGCCTCTAGTCGATAGAATAATGTAGGTAAGGGAA\n+>Locus_5_Transcript_6/6_Confidence_0.111_Length_262\n+AGGGATTAATCCCTAATCAAAGATCCATTCTTACACTGAAGTTTGGTTAAGATTGAAGCT\n+ATAGCTTGAGCGGAGTTAAGCCATCTTCAAGGAAACTTCTTCTTACGAATCGCAGCGATC\n+CCTAAACCTATTTCGAGTTGACCCCAATTTCCAGACCTCCAATGGTTATTACGGTTAACC\n+AGAGATTACCCAACCGACTAACCGACTAACATTTGCCACTTCATTGCCCTGCCACCTTCC\n+GCCGAACGCCGAACGCCGAACG\n+>Locus_50_Transcript_1/1_Confidence_0.000_Length_132\n+TATAATGGATCATATCCGCGAATATATCACTGACATGACCCATTTTTAAGTTGTTGGTGA\n+ACAAGTCATGTTGAAGAATTTCCGCGCTGACAAAGCAAATAAGCACGAGGAACACAAACA\n+TTTTGATATGAC\n+>Locus_61_Transcript_1/2_Confidence_0.667_Length_164\n+CACCGTCCTGCTGTCTATATCAAC
CAACGCCTTTCATGGGGTCTCATGAGCGGGAAGTTT\n+GGCACTTTAACCCGACGTTTGGTTCATCCCACAGCGCCAGTTCTGCTTACCAAAAGTGGC\n+CCACTGGGCACATTATATCATAACCTTGAACTTCATATCAGGAA\n'