Repository 'graphclust_preprocessing'
hg clone https://toolshed.g2.bx.psu.edu/repos/rnateam/graphclust_preprocessing

Changeset 11:c0c9d19bc7b2 (2017-07-18)
Previous changeset 10:16bcaef3dc1e (2017-06-01) Next changeset 12:8a1786cdcf95 (2017-11-20)
Commit message:
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit 746497a64b955f6b9afc1944d1c1d8d877e53267
modified:
preprocessing.xml
splitSHAPE.py
added:
splitStockholm.py
test-data/sample_4_alignment_data_split.stk
test-data/sample_4_all.stk
test-data/sample_4_representatives.fa
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 preprocessing.xml
--- a/preprocessing.xml Thu Jun 01 12:11:37 2017 -0400
+++ b/preprocessing.xml Tue Jul 18 01:43:49 2017 -0400
b
@@ -1,7 +1,8 @@
-<tool id="preproc" name="Preprocessing" version="0.2">
+<tool id="preproc" name="Preprocessing" version="0.3">
   <requirements>
-    <requirement type="package" version="0.1.12">graphclust-wrappers</requirement>
+    <requirement type="package" version="0.3.1">graphclust-wrappers</requirement>
     <requirement type="package" version="3.0">zip</requirement>
+    <requirement type="package" version="1.70">biopython</requirement>
 
   </requirements>
     <stdio>
@@ -19,13 +20,22 @@
             &&
             python '$__tool_directory__/splitSHAPE.py' 
                 '$SHAPEdata'
-                $max_length
+                
         #end if
+
+        #if $AlignmentData:
+            &&
+            python '$__tool_directory__/splitStockholm.py' 
+                '$AlignmentData'
+                
+        #end if
+             
 ]]>
  </command>
     <inputs>
         <param type="data" name="fastaFile" format="fasta" />
         <param type="data" name="SHAPEdata" format="txt" optional="true" label="SHAPE data"/>
+        <param type="data" name="AlignmentData" format="stockholm" optional="true" label="Alignments file"/>
         <param name="max_length" type="integer" value="10000" size="5" label="window size"/>
         <param name="in_winShift" type="integer" value="100" size="5" label="window shift in percent"/>
         <param name="min_seq_length" type="integer" value="5" size="5" label="minimum sequence length"/>
@@ -36,7 +46,8 @@
         <data name="data.names" format="txt" from_work_dir="FASTA/data.names" label="data.names"/>
         <data name="data.fasta.scan" format="fasta" from_work_dir="FASTA/data.fasta.scan" label="data.fasta.scan"/>
         <data name="FASTA" format="zip" from_work_dir="FASTA.zip" label="FASTA.ZIP"/>
-        <data name="shape_data_split" format="txt" from_work_dir="shape_data_split.react" label="SHAPE data splited"/>
+        <data name="shape_data_split" format="txt" from_work_dir="shape_data_split.react" label="SHAPE.data.split"/>
+        <data name="alignment_data_split" format="stockholm" from_work_dir="alignment_data_split.stk" label="alignments.data.stk"/>
     </outputs>
     <tests>
         <test>
@@ -57,6 +68,14 @@
             <param name="in_winShift" value="50"/>
             <param name="min_seq_length" value="5"/>
             <output name="shape_data_split" file="sample_3_shape_data_split.react" />
+        </test> 
+        <test>
+            <param name="fastaFile" value="sample_4_representatives.fa"/>
+            <param name="AlignmentData" value="sample_4_all.stk"/>
+            <param name="max_length" value="50"/>
+            <param name="in_winShift" value="50"/>
+            <param name="min_seq_length" value="5"/>
+            <output name="alignment_data_split" file="sample_4_alignment_data_split.stk" />
         </test>        
     </tests>
     <help>
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 splitSHAPE.py
--- a/splitSHAPE.py Thu Jun 01 12:11:37 2017 -0400
+++ b/splitSHAPE.py Tue Jul 18 01:43:49 2017 -0400
[
@@ -3,7 +3,6 @@
 import sys
 
 shape_file = sys.argv[1]
-win_size = int(sys.argv[2])
 
 pattern = re.compile("^>.*$")
 toWrite = ""
@@ -18,9 +17,9 @@
 name_file = "FASTA/data.names"
 array_all_chunks = []
 with open(name_file, 'r') as f:
-    content = f.read()
-    lines = content.split('\n')[:-1]
-    for line in lines:
+    for line in f:
+        if len(line.strip()) == 0:
+            continue
         seq_id.append(int(line.split()[0]))
         seq_string.append(line.split()[1])
         orig_id_srt = line.split()[3]
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 splitStockholm.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/splitStockholm.py Tue Jul 18 01:43:49 2017 -0400
[
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+########
+# This script reads multiple alignments merged in a single Stockholm file
+# and splits the alignment blocks according to the FASTA/data.names table.
+# The first sequence of each alignment is assumed to match the original
+# sequence ids listed in the names table.
+# Usage: splitStockholm.py <alignments.stk>
+# Output: alignment_data_split.stk in the working directory
+# Author: M. Miladi
+########
+import os
+import re
+import sys
+
+from Bio import AlignIO, SeqIO
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+NAMES_FILE = "FASTA/data.names"
+TARGET_FILE = "alignment_data_split.stk"
+GAP_SYMBOLS = '-.~_'  # characters treated as gaps in the reference row
+NUMBER_RE = re.compile(r'\d+')
+
+
+def read_names_table(path):
+    """Return (fragment_no, fragment_name, orig_id) for each table row.
+
+    Blank lines are skipped.  Column 1 is the integer fragment number,
+    column 2 the fragment name (e.g. ``SEQ1#1#50#+``) and column 4 the
+    original sequence id with a trailing ``_<suffix>`` that is removed.
+    """
+    rows = []
+    with open(path, 'r') as handle:
+        for line in handle:
+            fields = line.split()
+            if not fields:
+                continue
+            rows.append((int(fields[0]), fields[1],
+                         fields[3].rsplit('_', 1)[0]))
+    return rows
+
+
+def residue_columns(record):
+    """Return the alignment column index of every non-gap residue."""
+    return [col for col, char in enumerate(record.seq)
+            if char not in GAP_SYMBOLS]
+
+
+def main(stk_file):
+    """Split the alignments in *stk_file* into one block per fragment.
+
+    For each row of the names table, the alignment whose first sequence
+    carries the row's original id is cut to the columns spanned by the
+    fragment (coordinates are 1-based, inclusive, and refer to the
+    ungapped first sequence) and appended to TARGET_FILE with a
+    ``#=GF ID`` line.  Exits with a message when an alignment is missing
+    or the fragment coordinates do not fit the first sequence.
+    """
+    print("Parsing and splitting stk file:{}".format(stk_file))
+    rows = read_names_table(NAMES_FILE)
+
+    with open(stk_file) as stk_in:
+        # Key every alignment by the id of its first (reference) row.
+        alignments = {aln[0].id: aln
+                      for aln in AlignIO.parse(stk_in, "stockholm")}
+
+    columns_by_id = {}  # gap map per alignment, built once and reused
+    with open(TARGET_FILE, 'w') as out:
+        for frag_no, frag_name, orig_id in rows:
+            if orig_id not in alignments:
+                sys.exit("No alignment starts with sequence '{}' "
+                         "(fragment {})".format(orig_id, frag_name))
+            aln = alignments[orig_id]
+            if orig_id not in columns_by_id:
+                columns_by_id[orig_id] = residue_columns(aln[0])
+            columns = columns_by_id[orig_id]
+
+            frag_id = "{} {}".format(frag_no, frag_name)
+            numbers = NUMBER_RE.findall(frag_name)
+            print("{} {}".format(frag_id, numbers))
+            if len(numbers) < 3:
+                sys.exit("Cannot read coordinates from '{}'".format(frag_name))
+            # numbers[1] and numbers[2] are the fragment start and end;
+            # numbers[0] is taken to be the digits of the sequence name.
+            start, end = int(numbers[1]) - 1, int(numbers[2]) - 1
+            if not (0 <= start < len(columns) and 0 <= end < len(columns)):
+                sys.exit("Fragment {} lies outside sequence '{}' "
+                         "({} residues)".format(frag_name, orig_id,
+                                                len(columns)))
+            subalign = aln[:, columns[start]:columns[end] + 1]
+
+            # Biopython writes no "#=GF ID" line, so insert one right
+            # after the first line of its output.
+            buf = StringIO()
+            AlignIO.write(subalign, buf, format="stockholm")
+            first_line, _, rest = buf.getvalue().partition('\n')
+            out.write('{}\n#=GF ID {}\n{}'.format(first_line, frag_id, rest))
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: splitStockholm.py <alignments.stk>")
+    main(sys.argv[1])
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 test-data/sample_4_alignment_data_split.stk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_4_alignment_data_split.stk Tue Jul 18 01:43:49 2017 -0400
b
b'@@ -0,0 +1,798 @@\n+# STOCKHOLM 1.0\n+#=GF ID 1 SEQ1#1#50#+\n+#=GF SQ 5\n+ath1 TAACTCGGAAGTTGTCGATTGAACAAACTTGAGGTTTTGTCGTTTCCACG\n+#=GS ath1 AC ath1\n+#=GS ath1 DE ath1\n+bol1 TTACTTTGAAGTTGTCAACTAGGCAACCGCGAGGTTTTGTCTCCTTGACG\n+#=GS bol1 AC bol1\n+#=GS bol1 DE bol1\n+bna1 TTACTTTGAAGTTGTCAACTAGGCAACCGCGAGGTTTTGTCTCCTTGACG\n+#=GS bna1 AC bna1\n+#=GS bna1 DE bna1\n+bra1 TTACTCGGAAGTTGTCAACTAGGGAACCGCGAGGTTTTGTCGCCTTGACG\n+#=GS bra1 AC bra1\n+#=GS bra1 DE bra1\n+aly1 TTACTCGGAAGTTGTCGATTGAACAAACTTGAGGTTTTATCGTCTTCACA\n+#=GS aly1 AC aly1\n+#=GS aly1 DE aly1\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 2 SEQ1#26#75#+\n+#=GF SQ 5\n+ath1 AACTTGAGGTTTTGTCGTTTCCACG---------GCTGTCGTAGACGGTGGCAGCTGCT\n+#=GS ath1 AC ath1\n+#=GS ath1 DE ath1\n+bol1 ACCGCGAGGTTTTGTCTCCTTGACGGTCTTCAACACCGCTGTTGATGGTGGTGGCACG-\n+#=GS bol1 AC bol1\n+#=GS bol1 DE bol1\n+bna1 ACCGCGAGGTTTTGTCTCCTTGACGGTCTTCAACACCGCTGTTGATGGTGGCGGCACG-\n+#=GS bna1 AC bna1\n+#=GS bna1 DE bna1\n+bra1 ACCGCGAGGTTTTGTCGCCTTGACGGTCTTCAACACCGTCGTCGATGGT---GGCACG-\n+#=GS bra1 AC bra1\n+#=GS bra1 DE bra1\n+aly1 AACTTGAGGTTTTATCGTCTTCACATCTCTCACCGCCGCCGGAGACGGTGGCTGCTGCT\n+#=GS aly1 AC aly1\n+#=GS aly1 DE aly1\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 3 SEQ1#51#100#+\n+#=GF SQ 5\n+ath1 GCTGTCGTAGACGGTGGCAGCTGCTGCTGCAGCGGTTGATGATAGTGGTA\n+#=GS ath1 AC ath1\n+#=GS ath1 DE ath1\n+bol1 ACCGCTGTTGATGGTGGTGGCACG---TGTAACGTTTGGTGGTTATAGTA\n+#=GS bol1 AC bol1\n+#=GS bol1 DE bol1\n+bna1 ACCGCTGTTGATGGTGGCGGCACG---TGTAACGTTTGATGGTTATAGTA\n+#=GS bna1 AC bna1\n+#=GS bna1 DE bna1\n+bra1 ACCGTCGTCGATGGT---GGCACG---------------TGATTATAGTA\n+#=GS bra1 AC bra1\n+#=GS bra1 DE bra1\n+aly1 GCCGCCGGAGACGGTGGCTGCTGCTGCTGCAGCGGTTGATGAGAGTAGTA\n+#=GS aly1 AC aly1\n+#=GS aly1 DE aly1\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 4 SEQ1#62#111#+\n+#=GF SQ 5\n+ath1 CGGTGGCAGCTGCTGCTGCAGCGGTTGATGATAGTGGTAGGCGGAGAAGT\n+#=GS ath1 AC ath1\n+#=GS ath1 DE ath1\n+bol1 TGGTGGTGGCACG---TGTAACGTTTGGTGGTTATAGTAAGCTGTCAAGT\n+#=GS bol1 AC bol1\n+#=GS bol1 DE bol1\n+bna1 
TGGTGGCGGCACG---TGTAACGTTTGATGGTTATAGTAAGCTGTCAAGT\n+#=GS bna1 AC bna1\n+#=GS bna1 DE bna1\n+bra1 TGGT---GGCACG---------------TGATTATAGTATGCGGTCAAGT\n+#=GS bra1 AC bra1\n+#=GS bra1 DE bra1\n+aly1 CGGTGGCTGCTGCTGCTGCAGCGGTTGATGAGAGTAGTAGGCGGAGAAGT\n+#=GS aly1 AC aly1\n+#=GS aly1 DE aly1\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 5 SEQ2#1#50#+\n+#=GF SQ 5\n+ath2 GGTCGAGAAAGGAACCGGCAATATCGAACCGGAAAAATCCGAGATAACCA\n+#=GS ath2 AC ath2\n+#=GS ath2 DE ath2\n+bol2 AGCAGAGAACGGAACCGGTAGAATTGAACCGGTTGAACCGGAGCTGACCA\n+#=GS bol2 AC bol2\n+#=GS bol2 DE bol2\n+bna2 AGCAGAGAACGGAACCGGTAGAATTGAACCGGTTGAACCGGAGCTGACCA\n+#=GS bna2 AC bna2\n+#=GS bna2 DE bna2\n+bra2 AGCAGAGAACGGAACCGGTAAAATCGAACCGGTTGAACCGGAGCTGACCA\n+#=GS bra2 AC bra2\n+#=GS bra2 DE bra2\n+aly2 GGTCGAGAAAGGAACCGGCAAAATCGAACCGGAAACATTCGAGCTAACCA\n+#=GS aly2 AC aly2\n+#=GS aly2 DE aly2\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 6 SEQ2#26#75#+\n+#=GF SQ 5\n+ath2 GAACCGGAAAAATCCGAGATAACCACGTTTTGCATAAACTGGTACATAAG\n+#=GS ath2 AC ath2\n+#=GS ath2 DE ath2\n+bol2 GAACCGGTTGAACCGGAGCTGACCACGTTTTGCATAAACTGGTACAAAAG\n+#=GS bol2 AC bol2\n+#=GS bol2 DE bol2\n+bna2 GAACCGGTTGAACCGGAGCTGACCACGTTTTGCATAAACTGGTACAAAAG\n+#=GS bna2 AC bna2\n+#=GS bna2 DE bna2\n+bra2 GAACCGGTTGAACCGGAGCTGACCACGTTCTGCATAAACTGGTACAAAAG\n+#=GS bra2 AC bra2\n+#=GS bra2 DE bra2\n+aly2 GAACCGGAAACATTCGAGCTAACCACGTTTTGCATAAACTGGTACATAAG\n+#=GS aly2 AC aly2\n+#=GS aly2 DE aly2\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 7 SEQ2#51#100#+\n+#=GF SQ 5\n+ath2 CGTTTTGCATAAACTGGTACATAAGCAGAACGTCACCGTTAACCAAAGCC\n+#=GS ath2 AC ath2\n+#=GS ath2 DE ath2\n+bol2 CGTTTTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAACGCC\n+#=GS bol2 AC bol2\n+#=GS bol2 DE bol2\n+bna2 CGTTTTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAGCGCC\n+#=GS bna2 AC bna2\n+#=GS bna2 DE bna2\n+bra2 CGTTCTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAACGCC\n+#=GS bra2 AC bra2\n+#=GS bra2 DE bra2\n+aly2 CGTTTTGCATAAACTGGTACATAAGCAAAACGTCACCGTTAACCAAAGCC\n+#=GS aly2 AC aly2\n+#=GS aly2 DE aly2\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 8 SEQ2#71#120#+\n+#=GF SQ 
5\n+ath2 ATAAGCAGAACGTCACCGTTAACCAAAGCCATGTCCTTAAACCGGTCTCG\n+#=GS ath2 AC ath2\n+#=GS ath2 DE ath2\n+bol2 AAAAGCAAGACATCGCCGTTCACCAACGCCATGTCTTTAAACCGGTCTCG\n+#=GS bol2 AC'..b' bol8\n+#=GS bol8 DE bol8\n+bna8 CCGTCTTAGGGTTCGCCACGTCATCCATACCCTGAAGCATAATCAATAAC\n+#=GS bna8 AC bna8\n+#=GS bna8 DE bna8\n+aly8 CCGTTTTAGGGTTTGCAACATCATCCATAACCTGAAACACAAGGAATCAC\n+#=GS aly8 AC aly8\n+#=GS aly8 DE aly8\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 36 SEQ8#68#117#+\n+#=GF SQ 5\n+ath8 ACATCATCCATACCCTGAAACACAAGCAATCACCAAAA---ACTTAAACAAAG\n+#=GS ath8 AC ath8\n+#=GS ath8 DE ath8\n+bra8 ACATCATCCATACCCTGAAGCATAATCAATCACAAAACAATATTT-AAAAGAG\n+#=GS bra8 AC bra8\n+#=GS bra8 DE bra8\n+bol8 ACGTCATCCATACCCTGAAGCATAATCAATAACAAAACAATATTT-AAAAGA-\n+#=GS bol8 AC bol8\n+#=GS bol8 DE bol8\n+bna8 ACGTCATCCATACCCTGAAGCATAATCAATAACAAAACAATATTT-AAAAGA-\n+#=GS bna8 AC bna8\n+#=GS bna8 DE bna8\n+aly8 ACATCATCCATAACCTGAAACACAAGGAATCAC--------ACTT-AACAGAG\n+#=GS aly8 AC aly8\n+#=GS aly8 DE aly8\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 37 SEQ9#1#50#+\n+#=GF SQ 5\n+ath9 CGTAGAAAAGGCTTGACCGCAAAATGGATATATTATATGTACCTATGAGT\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGT\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGT\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGT\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 CATAGAAAAGGCTTGACCACAAACTGGA-----TATATATACCTATGAGT\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 38 SEQ9#26#75#+\n+#=GF SQ 5\n+ath9 GGATATATTATATGTACCTATGAGTTTTTCGAGCTTTGTTTGAAGTAGTA\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 AAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGTAGTA\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 AAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGTAGTA\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 AAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGAAGTA\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 
GGA-----TATATATACCTATGAGTTTTTCGAGCTTTGTTTGAAGTAGTA\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 39 SEQ9#51#100#+\n+#=GF SQ 5\n+ath9 TTTTCGAGCTTTGTTTGAAGTAGTAATCCTC---TCTACAATATTGAAGCCAA\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 TTCTCGAGCTTTGTTTGAAGTAGTAATCCTCTTCCCTGTAATATTCAAGCAAA\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 TTCTCGAGCTTTGTTTGAAGTAGTAATCCTCTTCCCTGTAATATTCAAGCAAA\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 TTCTCGAGCTTTGTTTGAAGAAGTAATCCTCTTCCCTATAATATCCAAGCAAA\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 TTTTCGAGCTTTGTTTGAAGTAGTAATCCTC---CCTAAAATATTGAAGCCAA\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 40 SEQ9#76#125#+\n+#=GF SQ 5\n+ath9 ATCCTC---TCTACAATATTGAAGCCAA-CTATGGTCAAACCACAATCAAATTC\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 ATCCTCTTCCCTGTAATATTCAAGCAAAGCTGTGAGTAAACTACAACC------\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 ATCCTCTTCCCTGTAATATTCAAGCAAAGCTGTGAGTAAACTACAACC------\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 ATCCTCTTCCCTATAATATCCAAGCAAAGCTGTGAGTAAACTGCAACC------\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 ATCCTC---CCTAAAATATTGAAGCCAA-CTACTGTCAAACCACAATAAATTTC\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 41 SEQ9#101#150#+\n+#=GF SQ 5\n+ath9 CTATGGTCAAACCACAATCAAATTCCCTATAGCTCCTCaaaaaaaaCTAC\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 CTGTGAGTAAACTACAACC----------TATTTGGGCAAA---------\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 CTGTGAGTAAACTACAACC----------TATTTGGGCAAA---------\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 CTGTGAGTAAACTGCAACC----------TAATTGGGCAAA---------\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 CTACTGTCAAACCACAATAAATTTCCCTATAGCTCCTCAAA---------\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n+# STOCKHOLM 1.0\n+#=GF ID 42 SEQ9#107#156#+\n+#=GF SQ 5\n+ath9 TCAAACCACAATCAAATTCCCTATAGCTCCTCaaaaaaaaCTACTCAAGC\n+#=GS ath9 AC ath9\n+#=GS ath9 DE ath9\n+bna9 
GTAAACTACAACC----------TATTTGGGCAAA---------------\n+#=GS bna9 AC bna9\n+#=GS bna9 DE bna9\n+bol9 GTAAACTACAACC----------TATTTGGGCAAA---------------\n+#=GS bol9 AC bol9\n+#=GS bol9 DE bol9\n+bra9 GTAAACTGCAACC----------TAATTGGGCAAA---------------\n+#=GS bra9 AC bra9\n+#=GS bra9 DE bra9\n+aly9 TCAAACCACAATAAATTTCCCTATAGCTCCTCAAA---------------\n+#=GS aly9 AC aly9\n+#=GS aly9 DE aly9\n+//\n'
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 test-data/sample_4_all.stk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_4_all.stk Tue Jul 18 01:43:49 2017 -0400
b
@@ -0,0 +1,72 @@
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath1  TAACTCGGAAGTTGTCGATTGAACAAACTTGAGGTTTTGTCGTTTCCACG---------GCTGTCGTAGACGGTGGCAGCTGCTGCTGCAGCGGTTGATGATAGTGGTAGGCGGAGAAGT 
+bol1  TTACTTTGAAGTTGTCAACTAGGCAACCGCGAGGTTTTGTCTCCTTGACGGTCTTCAACACCGCTGTTGATGGTGGTGGCACG---TGTAACGTTTGGTGGTTATAGTAAGCTGTCAAGT 
+bna1  TTACTTTGAAGTTGTCAACTAGGCAACCGCGAGGTTTTGTCTCCTTGACGGTCTTCAACACCGCTGTTGATGGTGGCGGCACG---TGTAACGTTTGATGGTTATAGTAAGCTGTCAAGT 
+bra1  TTACTCGGAAGTTGTCAACTAGGGAACCGCGAGGTTTTGTCGCCTTGACGGTCTTCAACACCGTCGTCGATGGT---GGCACG---------------TGATTATAGTATGCGGTCAAGT 
+aly1  TTACTCGGAAGTTGTCGATTGAACAAACTTGAGGTTTTATCGTCTTCACATCTCTCACCGCCGCCGGAGACGGTGGCTGCTGCTGCTGCAGCGGTTGATGAGAGTAGTAGGCGGAGAAGT 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath2  GGTCGAGAAAGGAACCGGCAATATCGAACCGGAAAAATCCGAGATAACCACGTTTTGCATAAACTGGTACATAAGCAGAACGTCACCGTTAACCAAAGCCATGTCCTTAAACCGGTCTCG 
+bol2  AGCAGAGAACGGAACCGGTAGAATTGAACCGGTTGAACCGGAGCTGACCACGTTTTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAACGCCATGTCTTTAAACCGGTCTCG 
+bna2  AGCAGAGAACGGAACCGGTAGAATTGAACCGGTTGAACCGGAGCTGACCACGTTTTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAGCGCCATGTCTTTAAACCGGTCTCG 
+bra2  AGCAGAGAACGGAACCGGTAAAATCGAACCGGTTGAACCGGAGCTGACCACGTTCTGCATAAACTGGTACAAAAGCAAGACATCGCCGTTCACCAACGCCATGTCTTTAAACCGGTCTCG 
+aly2  GGTCGAGAAAGGAACCGGCAAAATCGAACCGGAAACATTCGAGCTAACCACGTTTTGCATAAACTGGTACATAAGCAAAACGTCACCGTTAACCAAAGCCACGTCCTTAAACCGGTCTCG 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath3  CTGCACCATCTCCAAACAGAGCAGCTCCAACTAGGTCGTACGGACGCGCTTTGTTTGGTGGCCGAAATCCGAGAATGGTGGTTTCAGAGGTTGTGAGCAGAACACGGCTTCCGGGGTTGTTCTCAGCAATGTCTTTGGCCACACGGAGGCCAGTCACACCTCCGTAGCATCCCAGAAAATACAGCATCACTCTGTTCACGTCATTCCTCAGGCCTAGCTTTGCTGAGAGGTAAAGGTCAC 
+bna3  CCGCACCGTCCCCGAACAAAGCTGCCCCGACCAGGTCGTAAGGGCGAGCTTTGTTCGGTGGTCGGAACCCGAGGATGGTGGTTTCAGAGGTTGTGAGGAGCACACGGCTTCCGGGGTTGTTCTCGGCTATGTCTTTGGCGACACGGAGGCCTGTTACGCCTCCGTAGCAGCCTAGAAAATAAAGCATCACTCTGTTCACGTCGTTCTTTAAGCCTAGCTTGGCGGAGAGGTAAAGGTCAC 
+bol3  CCGCACCGTCCCCGAACAAAGCTGCCCCGACCAGGTCGTAAGAGCGAGCTTTGTTCGGTGGTCGGAACCCGAGGATGGTGGTTTCAGAGGTTGTGAGGAGCACACGGCTTCCGGGGTTGTTCTCGGCTATGTCTTTGGCGACACGGAGGCCTGTTACGCCTCCGTAGCAGCCTAGAAAGTAAAGCATTACTCTGTTCACGTCGCTCTTTAGGCCTAGCTTGGCTGAGAGGTAAAGGTCGC 
+bra3  CTGCACCGTCCCCGAACAAAGCTGCCCCGACCAGGTCGTAAGGGCGAGCTTTGTTCGGTGGTCGGAACCCGAGGATGGTGGTTTCAGAGGTTGTGAGGAGAACACGGCTTCCGGGGTTGTTCTCAGCGATGTCTTTGGCCACACGGAGGCCTGTTACGCCTCCGTAGCAGCCTAGAAAATAAAGCATTACTCTGTTCACGTCGCTCTTTAGGCCTAGCTTGGCTGAGAGGTAAAGGTCGC 
+aly3  CTGCACCATCTCCAAACAGAGCAGCTCCAACTAGGTCGTACGGACGAGCTTTGTTTGGTGGGCGAAACCCGAGAATGGTGGTTTCAGAGGTTGTGAGCAGAACACGGCTTCCCGGGTTGTTCTCAGCAATGTCTTTGGCCACACGGAGGCCAGTCACACCTCCGTAGCATCCCAGAAAATACAGCATCACTCTGTTCACGTCATTCCTCAGGCCTAGCTTTGCTGAGAGGTAAAGGTCAC 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath4  A----------TGAAGGAGGACTACTTACGAGTGTTGGAAGCGGCAGCGGCAGTGACCGCAGCTGAAGAGCTTGTCGGAGAAACCGACATCACCGCACATGCAACACACTCTCTCCATGa 
+bna4  ATATATAGCCTTGGAGAAGG--TACGTACGAGTGTTGAAAGCGGTTGCGGCAGTGCCCGCAGCGGAATAGTTTGTCGGAAAATCCGACGTCGCCGCACATGCAGCACACTTTCTCCATGT 
+bol4  ATATATAGCCTTGGAGAAGG--TACGTACGAGTGTTGAAAGCGGTTGCGGCAGTGCCCGCAGCGGAATAGTTTGTCGGAAAATCCGACGTCGCCGCACATGCAGCACACTTTCTCCATGT 
+bra4  ATATATGGCCTAGGAGAAGG--TACGTACGAGTGTTGAAAGCGGTTGCGGCAGTGCCCGCAGCGGAATAGCTTGTCGGAAAAGCCGACGTCGCCGCACATGCAGCACACTTTCTCCATGT 
+aly4  -----------TAAAGGAGGACTACTTACGAGTGCTGTAAGCGGCAGCGGCAGTGCCCGCAGCTGAAGAGCTTGTCGGAGAAACCGACGTCGCCGCACATGCAACATACTCTCTCCATGA 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath5  AACGGCGTCAAGGATCTCCTTCAAATTCTTGTGCCTTACCACCA-ATTTGGT--------------------CTTGTAGCAGTCGGAAGAAGTGGTAACATGGTCAGAGTGCTC 
+bra5  GATCGCATCAGCGATCTCTTTCAAATCCCTATGCCTAACGGCGACATCTCCT------------CCTCCACCTCTATAGCTACCTGAAAACGTCGTCGCATCATCTGCTTTATC 
+bol5  GATGGCATCAGCGATCTCTTTCAAATCCCTATGCCTAACGgcgacatctcctcctccgcctccgcctCCGCCTCTATAGCTACCGCAATACGTCGTCGCATCATCTTCTTTATC 
+bna5  GATGGCATCAGCGATCTCTTTCAAATCCCTATGCCTAACGGCGACATCTCCT------CCTCCGCCTCCGCCTCTATAGCTACCGCAATACGTCGTCGCATCATCTTCTTTATC 
+aly5  AACGGCGTCGAGGATCTCCTTCAAATTCTTGTGCCTTACCACCA-ATTTGGT--------------------CTTGTAGCAGTCGGAAGAAGTGGTAACATGGTCAGAGTTGTC 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath6  TAAAAAATAAAGAAT---CTTACCATCACCACGACTGTTTGTTCTAGCCAACTGATAAATAGTGTAGCCTGAAGATGAAAGCTGGTGGTGGTACATGTTCACTAATTCCTCATTCCCAAC 
+bol6  TAAAATGGAGAACCTTTGCTTACCATCGCCACGACGGTTTGTTCTAGGCAACTGGAAAATATTGTAGCCTGCAGTGGCAAGGCGATCATGGTACATGTTCACCAATTCTTCATTTGCAAC 
+bna6  TAAAATGGAGAACCTTTACTTACCATCGCCACGACGGTTTGTTCTAGGCAACTGGAAAATATTGTAGCCTGCAGTGGCAAGGCGATCATGGTACATGTTCACCAATTCTTCATTTGCAAC 
+bra6  TAAAATGGAGAACCTTTACTTACCATCGCCACGACGGTTTGTTCTAGGCAACTGGAAAATATCGTAGCCTGCAGTGGCAAGGCGATCATGGTACATGTTCGCCAATTCTTCATTTGCAAC 
+aly6  TaaaaaaaaaaGAAT---GTTACCATCACCACGACTGTTTGTTCTAGCCAACTGGAAAATAGTGTAGCCAGAAGTGCCAAGCTGGTCGTGGTACATGTTGACTAATTCCTCATTCCCAAC 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath7  TGGATCTCTGGATCAGCTGTGCGACCAGACGTGGATGATCCAAACGCTAAACGCCTTTGAGCGATTGGTCGAACACGTGAACGCGCCGCTGAAGCAAACGCTAATAATCGGGATTG---C 
+bra7  TGAAGCTCCGGGTCAGCAGTTCGGCCGGACGTTGATGATCCAAACGCTAAACTCCTTTGAACGACTGGTCGGACGCGGGAACGCGCCGCTGAAGCAAACGCTAATAATCGGGACTGCATC 
+bol7  TGAAGCTCCGGATCAGCAGCTCGGCCGGACGTTGAAGATCCAAACGCTAAACGACTTTGAGCGACTGGTCGGACGCGGGAACGCGCCGCTGAAGCAAACGCTAATAATCGGGACTGCATC 
+bna7  TGAAGCTCCGGATCAGCAGCTCGGCCGGACGTTGAAGATCCAAACGCTAAACGACTTTGAGCGACTGGTCGGACGCGGGAACGCGCCGCTGAAGCAAACGCTAATAATCGGGACTGCATC 
+aly7  TGGATCTCTGGATCAGCGGTGCGGCCAGACGTGGATGATCCAAACGCTAAACGCCTTTGAGCGATTGGTCGAACACGGGAACGCGCCGCAGAAGCAAACGCTAATAATCGGGATTGCATC 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath8  TTGCTCTTCTAAACTCGGTCGGGGAGTATCGTCTACGATTTCTTCCGCCGCCGTTTTAGGGTTTGCAACATCATCCATACCCTGAAACACAAGCAATCACCAAAA---ACTTAAACAAAG 
+bra8  TTGCTCTTCTAAACTTGACCGTGGTTGTTCATCGACGATTTCTTCTGCCGCCGTCTTAGGGTTCGCTACATCATCCATACCCTGAAGCATAATCAATCACAAAACAATATTT-AAAAGAG 
+bol8  TtgttgtTctaaacttgaccgtggttGTTCGTCGACGATTTCTTCTGCCGCCGTCTTAGGGTTCGCCACGTCATCCATACCCTGAAGCATAATCAATAACAAAACAATATTT-AAAAGA- 
+bna8  TTGTTGTTCTAAACTTGACCGTGGTTGTTCGTCGACGATTTCTTCTGCCGCCGTCTTAGGGTTCGCCACGTCATCCATACCCTGAAGCATAATCAATAACAAAACAATATTT-AAAAGA- 
+aly8  TTGCTCTTCTAAACTCGGTCGTGGGGTGTCGTCTACGATTTCATCCGCCGCCGTTTTAGGGTTTGCAACATCATCCATAACCTGAAACACAAGGAATCAC--------ACTT-AACAGAG 
+//
+# STOCKHOLM 1.0
+#=GF SQ 5
+ath9  CGTAGAAAAGGCTTGACCGCAAAATGGATATATTATATGTACCTATGAGTTTTTCGAGCTTTGTTTGAAGTAGTAATCCTC---TCTACAATATTGAAGCCAA-CTATGGTCAAACCACAATCAAATTCCCTATAGCTCCTCaaaaaaaaCTACTCAAGC 
+bna9  C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGTAGTAATCCTCTTCCCTGTAATATTCAAGCAAAGCTGTGAGTAAACTACAACC----------TATTTGGGCAAA--------------- 
+bol9  C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGTAGTAATCCTCTTCCCTGTAATATTCAAGCAAAGCTGTGAGTAAACTACAACC----------TATTTGGGCAAA--------------- 
+bra9  C-TACCAAATGC--AAAGATAAGGCAAA------------ACCTATGAGTTTCTCGAGCTTTGTTTGAAGAAGTAATCCTCTTCCCTATAATATCCAAGCAAAGCTGTGAGTAAACTGCAACC----------TAATTGGGCAAA--------------- 
+aly9  CATAGAAAAGGCTTGACCACAAACTGGA-----TATATATACCTATGAGTTTTTCGAGCTTTGTTTGAAGTAGTAATCCTC---CCTAAAATATTGAAGCCAA-CTACTGTCAAACCACAATAAATTTCCCTATAGCTCCTCAAA--------------- 
+//
b
diff -r 16bcaef3dc1e -r c0c9d19bc7b2 test-data/sample_4_representatives.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_4_representatives.fa Tue Jul 18 01:43:49 2017 -0400
b
@@ -0,0 +1,18 @@
+>ath1
+TAACTCGGAAGTTGTCGATTGAACAAACTTGAGGTTTTGTCGTTTCCACGGCTGTCGTAGACGGTGGCAGCTGCTGCTGCAGCGGTTGATGATAGTGGTAGGCGGAGAAGT
+>ath2
+GGTCGAGAAAGGAACCGGCAATATCGAACCGGAAAAATCCGAGATAACCACGTTTTGCATAAACTGGTACATAAGCAGAACGTCACCGTTAACCAAAGCCATGTCCTTAAACCGGTCTCG
+>ath3
+CTGCACCATCTCCAAACAGAGCAGCTCCAACTAGGTCGTACGGACGCGCTTTGTTTGGTGGCCGAAATCCGAGAATGGTGGTTTCAGAGGTTGTGAGCAGAACACGGCTTCCGGGGTTGTTCTCAGCAATGTCTTTGGCCACACGGAGGCCAGTCACACCTCCGTAGCATCCCAGAAAATACAGCATCACTCTGTTCACGTCATTCCTCAGGCCTAGCTTTGCTGAGAGGTAAAGGTCAC
+>ath4
+ATGAAGGAGGACTACTTACGAGTGTTGGAAGCGGCAGCGGCAGTGACCGCAGCTGAAGAGCTTGTCGGAGAAACCGACATCACCGCACATGCAACACACTCTCTCCATGa
+>ath5
+AACGGCGTCAAGGATCTCCTTCAAATTCTTGTGCCTTACCACCAATTTGGTCTTGTAGCAGTCGGAAGAAGTGGTAACATGGTCAGAGTGCTC
+>ath6
+TAAAAAATAAAGAATCTTACCATCACCACGACTGTTTGTTCTAGCCAACTGATAAATAGTGTAGCCTGAAGATGAAAGCTGGTGGTGGTACATGTTCACTAATTCCTCATTCCCAAC
+>ath7
+TGGATCTCTGGATCAGCTGTGCGACCAGACGTGGATGATCCAAACGCTAAACGCCTTTGAGCGATTGGTCGAACACGTGAACGCGCCGCTGAAGCAAACGCTAATAATCGGGATTGC
+>ath8
+TTGCTCTTCTAAACTCGGTCGGGGAGTATCGTCTACGATTTCTTCCGCCGCCGTTTTAGGGTTTGCAACATCATCCATACCCTGAAACACAAGCAATCACCAAAAACTTAAACAAAG
+>ath9
+CGTAGAAAAGGCTTGACCGCAAAATGGATATATTATATGTACCTATGAGTTTTTCGAGCTTTGTTTGAAGTAGTAATCCTCTCTACAATATTGAAGCCAACTATGGTCAAACCACAATCAAATTCCCTATAGCTCCTCaaaaaaaaCTACTCAAGC