Repository 'cd_hit'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/cd_hit

Changeset 0:e0da3400ac2f (2018-10-15)
Next changeset 1:7807800a3d03 (2021-11-05)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cdhit commit 8e14fc2573a53eaf8a538e018ae292f4c3134ec2
added:
cd_hit.xml
test-data/cd_hit_est_in.fa
test-data/cd_hit_protein_in.fasta
test-data/db1.fasta
test-data/db2.fasta
test-data/est-2d.txt
test-data/est-2d.txt.clstr
test-data/est_clusters_output.txt
test-data/est_fasta_output.fasta
test-data/protein_clusters_output.txt
test-data/protein_clusters_output_local.txt
test-data/protein_fasta_output.fasta
test-data/protein_fasta_output_local.fasta
b
diff -r 000000000000 -r e0da3400ac2f cd_hit.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cd_hit.xml Mon Oct 15 10:54:50 2018 -0400
[
b'@@ -0,0 +1,283 @@\n+<tool id="cd_hit" name="cd-hit" version="4.6.8.1">\n+    <description>Cluster or compare biological sequence datasets</description>\n+    <requirements>\n+        <requirement type="package" version="4.6.8">cd-hit</requirement>\n+    </requirements>\n+    <version_command><![CDATA[\n+cd-hit | grep "CD-HIT version" | cut -d" " -f 4\n+    ]]></version_command>\n+    <command detect_errors="exit_code"><![CDATA[\n+cd-hit$est.est_select$twod.twod_select\n+-i \'$fasta_in\'\n+-o rep_seq\n+-c $est.similarity\n+-n $est.wordsize\n+#if $est.est_select == \'-est\':\n+    $est.strand\n+    #if str($est.estadvalign.mask) != \'None\':\n+        -mask \'$est.estadvalign.mask\'\n+    #end if\n+    -match $est.estadvalign.match\n+    -mismatch $est.estadvalign.mismatch\n+    -gap $est.estadvalign.gap\n+    -gap-ext $est.estadvalign.gapext\n+#else:\n+    -t $est.redtol\n+#end if\n+#if $twod.twod_select == \'-2d\':\n+    -i2 \'$fasta_in2\'\n+    #if $advanced.advalign.style == \'local\':\n+        -s2 $advanced.advalign.advancedtwod.cutoff_diff_len2\n+        -S2 $advanced.advalign.advancedtwod.aa_cutoff_diff_len2\n+    #end if\n+#end if\n+\n+-b $advanced.band_width\n+-l $advanced.throw_away_len\n+#if $advanced.advalign.style == \'local\':\n+    -G 0\n+    -aL $advanced.advalign.align_coverage_long\n+    -AL $advanced.advalign.align_coverage_long_control\n+    -aS $advanced.advalign.align_coverage_short\n+    -AS $advanced.advalign.align_coverage_short_control\n+    -A $advanced.advalign.align_coverage_min\n+    -s $advanced.advalign.cutoff_diff_len\n+    -S $advanced.advalign.aa_cutoff_diff_len\n+#end if\n+-uL $advanced.max_unmatched_per_l\n+-uS $advanced.max_unmatched_per_s\n+-U $advanced.max_unmatched_len\n+$advanced.accurate\n+$advanced.inram\n+#if $print_alnovl.print_alnovl_select == "yes":\n+    -p 1\n+    -d $print_alnovl.desclen\n+#end if\n+\n+## instead of 800 (default) we use 0:unlimited\n+-M \\${GALAXY_MEMORY_MB:-0}\n+-T \\${GALAXY_SLOTS:-1}\n+    ]]></command>\n+    <inputs>\n+        <param name="fasta_in" argument="-i" type="data" format="fasta" label="Sequences to cluster/compare"/>\n+        <conditional name="twod">\n+            <param name="twod_select" type="select" label="Cluster / Compare (i.e. call cd-hit[-est] / cd-hit[-est]-2d)?">\n+                <option value="" selected="true">Cluster sequences</option>\n+                <option value="-2d">Compare with 2nd sequence data set</option>\n+            </param>\n+            <when value=""/>\n+            <when value="-2d">\n+                <param name="fasta_in2" argument="-i2" type="data" format="fasta" label="Other sequences to cluster/compare"/>\n+            </when>\n+        </conditional>\n+        <conditional name="est">\n+            <param name="est_select" type="select" label="Sequence type?" help="For nucleotides the -est variant of cd-hit is called">\n+                <option value="" selected="true">Protein</option>\n+                <option value="-est">Nucleotides</option>\n+            </param>\n+            <when value="">\n+                <param name="similarity" argument="-c" type="float" min="0.4" max="1.0" value="0.9" label="Sequence identity threshold" help="Global sequence identity: number of identical alignment positions divided by the full length of the shorter sequence"/>\n+                <param name="wordsize" argument="-n" type="integer" min="2" max="5" value="5" label="Word size">\n+                    <help>Suggested word size:\n+            5 for thresholds 0.7 ~ 1.0\n+            4 for thresholds 0.6 ~ 0.7\n+            3 for thresholds 0.5 ~ 0.6\n+            2 for thresholds 0.4 ~ 0.5 (-n)\n+                    </help>\n+                </param>\n+                <param name="redtol" argument="-t" type="integer" value="2" label="Tolerance for redundance"/>\n+            </when>\n+            <when value="-est">\n+                <param name="similarity" argument="-c" type="float" min="0.8" max="1.0" value="0.9" label="Sequence identity threshold" help="Global sequence identity: number of identical alignment positions div'..b'al name="twod">\n+                <param name="twod_select" value="" />\n+            </conditional>\n+            <conditional name="est">\n+                <param name="est_select" value="-est" />\n+                <param name="wordsize" value="8"/>\n+                <param name="strand" value="false"/>\n+            </conditional>\n+            <param name="similarity" value="0.9"/>\n+            <output name="clusters_out" file="est_clusters_output.txt"/>\n+            <output name="fasta_out" file="est_fasta_output.fasta"/>\n+        </test>\n+        <!-- cd-hit-est-2d (also changing strand param) -->\n+        <test>\n+            <param name="fasta_in" value="db1.fasta" />\n+            <conditional name="twod">\n+                <param name="twod_select" value="-2d" />\n+                <param name="fasta_in2" value="db2.fasta" />\n+            </conditional>\n+            <conditional name="est">\n+                <param name="est_select" value="-est" />\n+                <param name="wordsize" value="8"/>\n+                <param name="strand" value="true"/>\n+            </conditional>\n+            <param name="similarity" value="0.9"/>\n+            <output name="clusters_out" file="est-2d.txt.clstr"/>\n+            <output name="fasta_out" file="est-2d.txt"/>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+**What it does**\n+\n+cd-hit stands for Cluster Database at High Identity with Tolerance. The tool implements four variants: cd-hit, cd-hit-est, cd-hit-2d, and cd-hit-est-2d.\n+\n+The program cd-hit (resp. cd-hit-est) takes a FASTA format aminoacid (resp. nucleotide) sequence database as input and produces a set of \'non-redundant\' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the members of the sequence clusters for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing \'redundant\' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit (resp. cd-hit-est) produces a set of closely related protein (resp. nucleotide sequence) families from a given FASTA sequence database.\n+\n+The program cd-hit-2d (resp. cd-hit-est-2d) compares two aminoacid (resp. nucleotide) sequence datasets (db1, db2) in FASTA format. It identifies the sequences in db2 that are similar to db1 at a certain threshold. It outputs two files: a FASTA file of sequences in db2 that are not similar to db1 and a text file that lists similar sequences between db1 & db2.\n+\n+.. _CD-HIT: http://weizhongli-lab.org/cd-hit/\n+\n+------\n+\n+**Inputs**\n+\n+cd-hit/cd-hit-2d requires a (two) protein FASTA file(s) as input.\n+\n+cd-hit-est/cd-hit-est-2d requires a (two) nucleotide FASTA file(s) as input.\n+\n+------\n+\n+**Outputs**\n+\n+For cd-hit and cd-hit-est:\n+\n+1. The first output is a FASTA file containing representative sequences.\n+\n+2. The second output is a text file listing the mapping of sequences to the representative sequences:\n+\n+  >Cluster 0\n+  0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *\n+  >Cluster 1\n+  0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%\n+  1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%\n+  2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *\n+  3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%\n+  4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%\n+  >Cluster 2\n+  0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%\n+  1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *\n+  2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%\n+  3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%\n+\n+For cd-hit-2d and cd-hit-est-2d:\n+\n+1. The first output is a FASTA file of sequences in db2 that are not similar to db1.\n+\n+2. The second output is a text file that lists similar sequences between db1 & db2\n+    ]]></help>\n+    <citations>\n+        <citation type="doi">10.1093/bioinformatics/btl158</citation>\n+        <citation type="doi">10.1093/bioinformatics/bts565</citation>\n+    </citations>\n+</tool>\n'
b
diff -r 000000000000 -r e0da3400ac2f test-data/cd_hit_est_in.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cd_hit_est_in.fa Mon Oct 15 10:54:50 2018 -0400
b
b'@@ -0,0 +1,74 @@\n+>F12Fcsw_481739\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGACGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_133982\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA\n+>F14Fcsw_149685\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_175165\n+CGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_176364\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACGCGGTCCAA\n+>F14Fcsw_224425\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTAAGACACGGTCCAA\n+>F14Fcsw_27361\n+CGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_2745\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_37069\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAG\n+>F14Fcsw_38031\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCAA\n+>F14Fcsw_49588\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACGCGGTCCAA\n+>F22Fcsw_400293\n+CCGGCGCACGGGTGAGTAACACGTATCCAACCTTCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCAACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_133990\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_160873\n+CGGGTGAGTAACGCGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTAGTAGGCGGGGTAACGGCCCACCTAGCCAACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_86009\n+GGCGACCGGCGCACGGGTGAGTAACGCGTATCCAACCTTCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCAACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_96640\n+TCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGAATTTCGGTCATCGATGGGGATGCGT'..b'GGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M13Fcsw_127764\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGTACTGAGACACGGACCAA\n+>M13Fcsw_128004\n+CGGACGGGTGAGTAACGCGTGAGTAACCTGCCGATAACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGAATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGTACTGAGACACGGACCAA\n+>M13Fcsw_198303\n+TAACACGTATCCAACCTGCCTCATACTCGGGGATAACCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCAACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_117325\n+GTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_151062\n+CGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_181677\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_186607\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGGTTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M24Fcsw_136217\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTCTGATTAGCTTGTTGGCGGGGTAACGGCCCACCAAGGCACCGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATAGGAACTGAGACACGGTCCTA\n+>M41Fcsw_259146\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_137216\n+CAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_138199\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_225418\n+CACGTATCCAACCTGCCGTCTACTCTTGGACAGCCTTCTGAAAGGAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_263016\n+ACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCCA\n+>M42Fcsw_334979\n+GGGATAGCCGCCCGAAAGGACGGGTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M43Fcsw_250770\n+GTATCCAACCTGCCGTCTACTCTTGGACAGCCTTCTGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M44Fcsw_200453\n+CTAGTGGCGGACGGGTGAGTAACGCGTATCCAACCTGCCGATGACTCGGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGAACTTCGGTCATCGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n'
b
diff -r 000000000000 -r e0da3400ac2f test-data/cd_hit_protein_in.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cd_hit_protein_in.fasta Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,50 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>tr|K7D361|K7D361_PANTR Alcohol dehydrogenase 1B (Class I), beta polypeptide OS=Pan troglodytes GN=ADH1B PE=2 SV=1
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPAILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2
+MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT
+PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP
+RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG
+YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ
+NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF
+DLLRSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00338-5|LDHA_HUMAN Isoform 5 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVERVFT
+E
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF
+>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2
+MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI
+SDVVKVTLTHEEEACLKKSADTLWGIQKELQF
b
diff -r 000000000000 -r e0da3400ac2f test-data/db1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/db1.fasta Mon Oct 15 10:54:50 2018 -0400
b
b'@@ -0,0 +1,598 @@\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_2470_14480_TTGACACCTCTTTCCC  \n+aacgcacattgcgcctcgaggcattcctcgaggcatgcctgtttgagcgtcgcatcccctctaacccccggttaggcgttgctccgaaatatcaaccgcgctgtcaaacacgtttacagcacgacatttcgccctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2112_20171_24879_TTGACACC  \n+aacgcacattgcgccccttggtattccgaggggcatgcctgttcgagcgtcattacaccactcaagctatgcttggtattgggcgtcgtccttagttgggcgcgccttaaagacctcggcgaggccactccggctttaggcgtagtagaatttattcgaacgtctgtcaaaggagaggaactctgccgactgaaacctttatttttctaggttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1102_3280_9288_TACCGTAG  \n+aacgcaaattgcactctcgtcttcgcgacaagagtttgcctctttcagcatcggttctatttgttttcttaccttgtgtggggaaacgaaagtgagagttatcttcaccggaagatatctctttaagtgttggagaaaacaaaggttcatttcttgttttcttgtgtgataattcgcttcggcgattctccaagatgacttaaaatgtttccaatgtatctcgtctaaagttgatgtgtctcgaaagagatgccaaaacacaaccattggttgtcttttttgatccgatctgaaatgaagcaaggttacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2101_4746_9076_TTGACACC  \n+aacgcacattgcgccccttggtattccggggggcatgcctgttcgagcgtcatttcaaccctcaagcttagcttggtattgagtctatgtcagtaatggcaggctctaaaatcagtggcggcgccgctgggtcctgaacgtagtaatatctctcgttacaggttctcggtgtgcttctgccaaaacccaaatttttctatggttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_81_000000000-B4L4B_1_2109_25031_20953_GTCGATAC  \n+aacgcacattgcgccctgtggtattccacagggcatgcctgtttgagcgtcatttctccctcaaacctctggtttggcgttgagtgatactcggtttacttgaaaaacatgaaaagcataactattaggttttaccaactcgttatactaatctacaagtttgacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-M4070_81_000000000-B4JFH_1_2104_13788_3780_TATGCCAC  \n+aacgcaagttgcggtcttcttacgaagaccatgtttgcctgagggttgttaacacatcaaattgaggttgtgtgtttcttaattgaagcactcccttgaaatctagtttgaggccaagtttacttggttcaatcttgttattgaatccatatgtaagtaagcaatttgcttgcttgcaacctcgaaccatattttcacaaataagggtttaatagtgagatcttcggatcttgcttccttaactaagcacttattgaaggagaatgtgaattcaaggaatggtttttcagatattgagactttcttcactaacgtgtaaagaagtatcaatatgcataattcaaatcgcaacctcagatcaagcaagactacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1102_17771_4855_TTGACACCTCTTTCCC  \n+aacgcaacttgcgctgtcgagtaaccctcttcagcatgcctccttgagcatcgtttgtgcacccccttccgggctcttctccttgccctcgagccgaccttttcctaaaagtcacctcgggggtcgaatcgagcggtcccggttgctagtccccagcccccacagagcgtgccctgtcacacctagtgtgccctggccttcttgcgggccggaccggcgaccctcagtaacggaccttcaaatgaccaaaccgcacgatctcaagtgaggcaaggctacccgccgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1104_11204_22301_TACCGTAG  \n+aacgcacattgcgcccgccagtattctggcgggcatgcctgtccgagcgtcatttcaaccatcaagcccccggcttgtgttggggacctgcggcacacccgcaggccctgaaaaccagtggcgggctcgctgtccacaccgagcgtagtagcatctttgtctcgctcagggcgtggggcgggttccggccgtgaaagccaccttctcaaggtacccaaaggttgacctcggatcaggtaggaagacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_2660_12531_TTGACACCTCTTTCCC  \n+aacgcatcttgcgccttttggtattccaaaaggcacacctgtttgagtgtcatgaaaccctctcattaaacattttttaacttgtttaaacttgtttagtggatgttgagtgttgctgtcattagctcactttaaatatataagtcacttttcaaataagttggattgacttggtgtaataattttatcatcacatcaaggaaagtagcaatacttgccatcttgtttaatataagggacttctaaaaaccccccttttttcaaaatttaagacctcaaatcaggtgggactacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_16274_21582_GTATTGGCTCTTTCCC  \n+aacgcacattgcgcccgccagcattctggcgggcatgcctgttcgagcgtcatttcaaccctcgacctccctttggggaagtcggcgttggggaccggcagcacaccgccggccctgaaatggagtggcggcccgtccgcggcgacctctgcgtagtaaaccaactcgcaccggaaccccgacgtggccacgccgtaaaacacccaacttctgaacgttgacctcgaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_1116_18951_12440_AAGTCGGA  \n+aacgcacattgcgccctctggtattccggagggcatgcctgtttgagcgtcgtttctccctcaaaccgctgggtttggtgttgagcaatacgacttgggtttgcttgaaagacggtagtggtaaggcgggatcgctttgacaatggcttaggtctaaccaaaaacattgcttgcggcggtaacgtccaccacgtatatcttcaaactttgacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1107_6648_2646_TACCGTAG  \n+aacgcaccttgcgctccatggcattccgtggagcatgcctgtttgagtgccgcgaaatctcccacccctagcggttgccgcaaggcgccgccgggcggcggggttggttgggtgccgctgcctgggctttggcccaggctcgcccgaaatgcatgagcgcttagcactcgcaccgtctcgagggaaggcggcggagctggcatctgcgcatggcatgatacgtcatttgctgtgtggcagcccagcgacccgagagtgtgtgtgcgctactac'..b'ctctacaatgacggtagcattcaacccagttgaatgattcacttgtcaaaactatatactgattctgatctgaaatgaggcaaggttacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1104_22194_4139_TACCGTAG  \n+aacgcacattgcgccctttggtattccaaagggcatgcctgttcgagcgtcatttgtaccctcaagctttgcttggtgttgggcgtcttttgtctccagctcgctggagactcgccttaaagtcattggcagccggcctactggtttcggagcgcagcacaagtcgcgctcttcttccagccaaggtcagcgtccagcaagcctttttttcaacctttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1114_4409_8537_GTATTGGCTCTTTCCC  \n+catatcaataagcggaggaagtactgtatacagtactgtgaatcatcgaatcttt\n+>MISEQ-1639_50_000000000-ATW8R_1_2113_18246_13747_GTATTGGCTCTTTCCC  \n+aacgcacattgcgccccctggtattccggggggcatgcctgtccgagcgtcattacaaccctcaagctcagcttggtgttgggccccgccgccccggcgggccctaaagtcagtggcggtgccgtccggctccgagcgtagtaattcttctcgctctggaggtccggtcgtgtgcttgccagcaacccccaatttttttcaggttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1104_20416_21905_TACCGTAG  \n+aacgcaaatggcgctcttcggctcgcccggagagcaggcctccttcagcgtcgtttcatctctcactcactaacctgagtggattatggagtgtcttttttgcgcaagcgacaaagacctcttaaggaatagttgacttgtggttttgacagtgcttcggcattgtaaagactaaccacgaacaagtttcagatgttaatgtttactctccaagagttagttcacctttgtgtgatccaaacttggatgactaactacgcattttggtgtgcagttggtgatcgcatacaattattctaacattcgacctgaagtgaagtcggaggacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1114_17108_6350_GTATTGGCTCTTTCCC  \n+aacgcaaatggcacctatgcttgcataggtacatctatttgagtgttggtttttaccatcaacctcaaattagatgagagtacccgctgaatttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1106_14470_15263_TACCGTAG  \n+aacgcaccttgcgctccttggtattccgaggagcatgcctgtttgagcgtcgtgaatctctcaacccacaaattgtttgttcagtttgtgcggcttggacttggaggcattgctggcgatggtcagctcctctcaaatgtattagctgggcttagctcggtggaacggttagtgtgatatcactttgaatctgacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-M4070_81_000000000-B4JFH_1_2103_9975_10375_TACCGTAG  \n+aacgcaaatggcgccgtcggcacatgccggcggcatgtctgcttgagaatcagtaaccaaaacaaaccactggatcaccggggggcgggcgaagtttgttcttccccttcttcaggtggagcagcgtgggctgtcaggggaggggcgctcgcgcctcactcttgtcgcctgaaatgagcgctcacaatgtgcgcaggcaacgcaaagtgtgcgtgtgggtgcttggcatgagggccggcgtggcacggaacttcttgttcttgttcgctcgctgctcggccaggcgcacaggcgcgcgcaccgtcgcggtgcattttgcgcacagaaatcatgcagatgatctcaagtcaggcaagggtacccgctgaatttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1106_4432_6824_GTATTGGCTCTTTCCC  \n+catatcaataagcggaggaactgtagtacactacagtgtgagtcatcgaatcttt\n+>MISEQ-1639_78_000000000-B3PNL_1_1106_14590_19880_ATGCATGG  \n+aacgcacattgcactgtctggaactccgggcagtatacctgtttgagtatcagctactactctcactatcagtcactttggtggtctagtgaggacttgagttatcatggactaccctcgggaagactgtgcgactttaaatatgtaccagcgtccctaggatcgctgtttccttgcagagttgatggccttgtgcctaatcctgttagggtagcaaatcgtatacctagtggttaaccgtagctggctaaaaaccatacagtggtcagtagtaactggccgactgtgactttgatctcaaatcaggtaagattacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_1109_14958_23601_ACGAGAGA  \n+aacgcaccttgcgctccttggtattccgaggagcatgcctgtttgagtgtcatgaaattctcaacctaacaagttcttaaccggacttgcttaggcttggacttggaggcttgtcggctcttagcagtcggctcctctcaaatgcattagctttggttccttgcggatcggctcacggtgtgataattgtctacgccgcgaccgttgaagcgttttaatggccagcttctaatcgtctcttgcgaacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_2105_6505_5183_TGTTAGGCTCTTTCCC  \n+aacgcaccttgcgctccttggtattccgaggagcatgcctgtttgagtgtcatgaaactctcaaacccaagttttggatttcgatccatgcttgagtttggatttggatgtttgccggtgataagccgactcatcttaaaagtattagctagatctgtctctatgactggtttgacttggcataataagtattttgctgaggacatcttcggatggccaggacctagactattgtatgctaactaaaccatcacttaatgtgtaacttcgacatctggcctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2102_25721_20172_AAGTCGGA  \n+aacgcacattgcgcccgccagcattctggcgggcatgcctgttcgagcgtcatttcaaccctcgagcccccccgggggcctcggtgttgggggacggcacaccagccgcccccgaaatgcagtggcgaccccgccgcagcctcccctgcgtagtagcacacacctcgcaccggagcgcggaggcggtcacgccgtaaaacgcccaactttcttagagttgacctcggatcaggtaggaatacccgttgaacttaagtggctccccccagcattttggctgttattccttttcgagcgtcatttcaaccctcgagcccccccgggggcctcggtgttgggggacggcacgccagccgcccccgaaatgcagtggcgaccccgccgcagcctcccctgcgtagtagcacacacctcgcaccggagcgcggaggcggtcacgccgtaaagcgcccaactttcttagagttgacctcggatcaggtaggaatacccgctgaacttaa\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r e0da3400ac2f test-data/db2.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/db2.fasta Mon Oct 15 10:54:50 2018 -0400
b
b'@@ -0,0 +1,560 @@\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_2470_14480_TTGACACCTCTTTCCC  \n+aacgcacattgcgcctcgaggcattcctcgaggcatgcctgtttgagcgtcgcatcccctctaacccccggttaggcgttgctccgaaatatcaaccgcgctgtcaaacacgtttacagcacgacatttcgccctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2112_20171_24879_TTGACACC  \n+aacgcacattgcgccccttggtattccgaggggcatgcctgttcgagcgtcattacaccactcaagctatgcttggtattgggcgtcgtccttagttgggcgcgccttaaagacctcggcgaggccactccggctttaggcgtagtagaatttattcgaacgtctgtcaaaggagaggaactctgccgactgaaacctttatttttctaggttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1102_3280_9288_TACCGTAG  \n+aacgcaaattgcactctcgtcttcgcgacaagagtttgcctctttcagcatcggttctatttgttttcttaccttgtgtggggaaacgaaagtgagagttatcttcaccggaagatatctctttaagtgttggagaaaacaaaggttcatttcttgttttcttgtgtgataattcgcttcggcgattctccaagatgacttaaaatgtttccaatgtatctcgtctaaagttgatgtgtctcgaaagagatgccaaaacacaaccattggttgtcttttttgatccgatctgaaatgaagcaaggttacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2101_4746_9076_TTGACACC  \n+aacgcacattgcgccccttggtattccggggggcatgcctgttcgagcgtcatttcaaccctcaagcttagcttggtattgagtctatgtcagtaatggcaggctctaaaatcagtggcggcgccgctgggtcctgaacgtagtaatatctctcgttacaggttctcggtgtgcttctgccaaaacccaaatttttctatggttgacctcggatcaggtagggatacccgctgaacttaa\n+>MISEQ-1639_81_000000000-B4L4B_1_2109_25031_20953_GTCGATAC  \n+aacgcacattgcgccctgtggtattccacagggcatgcctgtttgagcgtcatttctccctcaaacctctggtttggcgttgagtgatactcggtttacttgaaaaacatgaaaagcataactattaggttttaccaactcgttatactaatctacaagtttgacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-M4070_81_000000000-B4JFH_1_2104_13788_3780_TATGCCAC  \n+aacgcaagttgcggtcttcttacgaagaccatgtttgcctgagggttgttaacacatcaaattgaggttgtgtgtttcttaattgaagcactcccttgaaatctagtttgaggccaagtttacttggttcaatcttgttattgaatccatatgtaagtaagcaatttgcttgcttgcaacctcgaaccatattttcacaaataagggtttaatagtgagatcttcggatcttgcttccttaactaagcacttattgaaggagaatgtgaattcaaggaatggtttttcagatattgagactttcttcactaacgtgtaaagaagtatcaatatgcataattcaaatcgcaacctcagatcaagcaagactacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1102_17771_4855_TTGACACCTCTTTCCC  \n+aacgcaacttgcgctgtcgagtaaccctcttcagcatgcctccttgagcatcgtttgtgcacccccttccgggctcttctccttgccctcgagccgaccttttcctaaaagtcacctcgggggtcgaatcgagcggtcccggttgctagtccccagcccccacagagcgtgccctgtcacacctagtgtgccctggccttcttgcgggccggaccggcgaccctcagtaacggaccttcaaatgaccaaaccgcacgatctcaagtgaggcaaggctacccgccgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1104_11204_22301_TACCGTAG  \n+aacgcacattgcgcccgccagtattctggcgggcatgcctgtccgagcgtcatttcaaccatcaagcccccggcttgtgttggggacctgcggcacacccgcaggccctgaaaaccagtggcgggctcgctgtccacaccgagcgtagtagcatctttgtctcgctcagggcgtggggcgggttccggccgtgaaagccaccttctcaaggtacccaaaggttgacctcggatcaggtaggaagacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_2660_12531_TTGACACCTCTTTCCC  \n+aacgcatcttgcgccttttggtattccaaaaggcacacctgtttgagtgtcatgaaaccctctcattaaacattttttaacttgtttaaacttgtttagtggatgttgagtgttgctgtcattagctcactttaaatatataagtcacttttcaaataagttggattgacttggtgtaataattttatcatcacatcaaggaaagtagcaatacttgccatcttgtttaatataagggacttctaaaaaccccccttttttcaaaatttaagacctcaaatcaggtgggactacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1101_16274_21582_GTATTGGCTCTTTCCC  \n+aacgcacattgcgcccgccagcattctggcgggcatgcctgttcgagcgtcatttcaaccctcgacctccctttggggaagtcggcgttggggaccggcagcacaccgccggccctgaaatggagtggcggcccgtccgcggcgacctctgcgtagtaaaccaactcgcaccggaaccccgacgtggccacgccgtaaaacacccaacttctgaacgttgacctcgaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_1116_18951_12440_AAGTCGGA  \n+aacgcacattgcgccctctggtattccggagggcatgcctgtttgagcgtcgtttctccctcaaaccgctgggtttggtgttgagcaatacgacttgggtttgcttgaaagacggtagtggtaaggcgggatcgctttgacaatggcttaggtctaaccaaaaacattgcttgcggcggtaacgtccaccacgtatatcttcaaactttgacctcaaatcaggtaggactacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1107_6648_2646_TACCGTAG  \n+aacgcaccttgcgctccatggcattccgtggagcatgcctgtttgagtgccgcgaaatctcccacccctagcggttgccgcaaggcgccgccgggcggcggggttggttgggtgccgctgcctgggctttggcccaggctcgcccgaaatgcatgagcgcttagcactcgcaccgtctcgagggaaggcggcggagctggcatctgcgcatggcatgatacgtcatttgctgtgtggcagcccagcgacccgagagtgtgtgtgcgctactac'..b'CCC  \n+aacgcacattgcgccctttggtattccgaagggcatgcctgttcgagcgtcattttcacccctcaagcccccggcttggtgttggacggtttggtccagggccccccctggacccctcccaaagacaatgacggcgggctgttgcacccccggtacactgagcatcttcacggagcacgtaccggtctcaagggtcgacggcacccggtctacacctatatctttcacaaggttgacctcggatcaggtaggaatacccgctgaacttaagccctttggtattccgaagggcatgcctgttcgagcgtcattttcacccctcaagcccccggcttggtgttggacggtttggtccagggccccccctggacccctcccaaagacaatgacggcgggctgttgcacccccggtacactgagcatcttcacggagcacgtaccggtctcaagggtcgacggcacccggtctacacctatatctttcacaaggttgacctcggatcaggtaggaatacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_2115_16628_11115_TACCGTAG  \n+aacgcatattgcactcttgtcttcgcgatgagagtatacctctttcagtatcggttctatctcaatacttgaattagtattggaaaatgagagttattattcttatgtgggtaatatctcttgaattgttatgttgaagcgtccgtactcatcgggcactgattatatactgaacacttcaaatgttttagttcgcttttgtagcgattcactcatttgcatttgtgtgtttctgtgtgtgatcaatacctgtaaaaatgtatgttggcttcctctacaatgacggtagcattcaacccagttgaatgattcacttgtcaaaactatatactgattctgatctgaaatgaggcaaggttacccgctgaacttaa\n+>MISEQ-1639_50_000000000-ATW8R_1_1113_22185_9959_AAGTCGGATCTTTCCC  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggggggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagcatatttatttagtcccgtcaaacggattattacttttgctgcagctaacataaagggagtttgaccatattcgatgacagatccaggaagactacccgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_1108_23183_16804_AATCGACC  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggagggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagcatatttatttagtcccgtcaaacggattattacttttgctgcagctaacataaagtgagtttgataaaactcgatcacaaatgaaggaagactactcgctgaacttaa\n+>MISEQ-1639_51_000000000-ATW7A_1_2112_12797_24728_AATCGACT  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggagggttgcttgaaatgtaggtgcagctggacattctcctgagctaaaaccatatttatttagtcccgtcaaacggattattacttttgctgcagctaaaataaagggagtttgataaaactcgatgactgatcaagtaagaatacccgctgaacttaat\n+>MISEQ-1639_51_000000000-ATW7A_1_1116_25567_9198_ACGAGAGA  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgattgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcacaacaccagtctttacgatcggtggagggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagcatatttatttagtcccgtcaaacggattattacttttgctgcagctaacataaagggagtttgataaaactggctctctaatcaagtaagactacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1107_3603_14752_ATGCATGG  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggagggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagaatatttatttagtccagtcaaacggattattacttttgctgcagctaccataaagggagtttgataaaactcgatctcagatccagtaagactacccgctgaacttaat\n+>MISEQ-1639_78_000000000-B3PNL_1_2104_19995_22755_ATGCATGG  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggagggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagcatatttatttagtcccgtaaaacgggatctcaaatcaagtaagactacccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_1115_16169_21596_ATGCATGG  \n+aacgcatattgcgctctttggtattccgaagagcatgcttgtttgagtatcagtaaacacctcaaagacttcaatttgttttgaatggctttggacttgagcaatcccaacaccagtctttacgatcggtggagggttgcttgaaatgcaggtgcagctggacattctcctgagctaaaagcatatttatttagtcccgtcaaacggattattacttttgctgcagctaacataaagggagattgaaaatcaagtaagactgcccgctgaacttaa\n+>MISEQ-1639_78_000000000-B3PNL_1_2118_17053_20751_ATGCATGG  \n+aacgcacattgcgcccgccagcattctggcgggcatgcctgttcgagcgtcatttcaaccctcgacttccctttggggaaatcggcgttggggaccggccgtataccgccggccccgaaatgaggtggcggcccgtccgcggcgacctctgcgtagaaatccaactcgcaccggaaccccgacgtggccacgccgtaaaacccccgccttctgaacgttgacctcggatcaggtaggaatccccgcagaacttaaccgccccttgcgcccgccagcattctgtcgggcgtgcctgttcgagcgtcatttcaaccctcgactgccctctggggaaatcggcgttggggaccggccgtataccgccggccccgaaatgaggtggcggcccgtccgcggcgacctctgcgtagaaatccaactcgcaccggaaccccgacgtggccacgccgtaaaacccccgacttctgaacgttgacctcggatcaggtaggaatacccgctgaacttaa\n\\ No newline at end of file\n'
b
diff -r 000000000000 -r e0da3400ac2f test-data/est-2d.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est-2d.txt Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,18 @@
+>MISEQ-M4070_81_000000000-B4JFH_1_1119_8913_6810_TACCGTAG  
+aacgcacattgcgcccgccggcactccggcgggcatgcctgtccgagcgtcatttcaaccctcaggccctcccttccgggggacgggcctggtgttggggctcggggtccgcaaggcccccgtcccctaaaatcattggcggtcgcgctgtaatcccctgcgtagtaacacacctcgctctggagaccagcacgtccacgccgtaaaacccccaatttaccaaggttgacctcggatcaggtaggaatacccgctgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_2112_16439_13482_TACCGTAG  
+aacgcacattgcgcccgccggtattccggcgggcatgcctgtccgagcgtcatttcaaccctcaggccctgcctggtgttggggcgccactgctgtgggcccccaaagccagcggcgggcacggcccggacccgagcgtagtagtataactcgctaggggctccggccgcgctccggccgtaaaacctcctataaaggttgacctcggatcaggtaggaatacccgctgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_2108_13568_20621_TACCGTAG  
+aacgcacattgcacccggtggtattccgccgggtatgcctgttcgagcgtcatttccaccctcaggcccccctcctcacgggggggagggcctggtgttggggatctacgggcccttctggcgcccgtagtcccctaaattaatcggcgtgcgcgctgtacgttgccccctgcgtagtagtgatttctcgcatcgggccgtagcgctgtccagccgctaaactgcaactacgcatttagctaggtttgacctcgaatcaggtagggttacccgctgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_2108_9997_12215_TACCGTAG  
+aacgcacattgcgcccggcagtaatctgccgggcatgcctgtccgagcgtcatttctgccctcgagcgagttccagcatctaggtgctggtgctcgcccggcgttggggcactacggtagagccctgtgctgtccgtaggccctgaaatgaagtggcggtcctgccgcggcgccccctgcgtagtataacagctcgcttcgggacccggtggaggccagccgtcaaacctatattctctaagtttgacctcggatcaggtagggttacccgctgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_1103_22701_7420_TACCGTAG  
+aacgcacattgcgcttccgggatactcctgggagcatacctgtttcagtgtctgtttaaccctcacccgtggcaacacggtttgtggatgacgggcgtccgcgcgggcttcaccgccctgtggtcgcctcaggtttagcgacatgccgccgctctctgcctacgagcgtccgcgcgcatggtgtggtaggtttttctggcgccctcgcggcccgggatttcacaacccccatttcgccgcgttccagtttggccaggcccggtgacccgcgtgtccggctttgagtagccgaacccaaacaattggtcggtggggcgcgcgctcgtcgcgcaccccgcccactgcttttcggacctgaaatcaggcaagaaaacccgctgaatttaa
+>MISEQ-1639_78_000000000-B3PNL_1_1107_26593_19107_TACCGTAG  
+aacgcaaattgcactctcgtcttcgcgatgagagtttgcctctttcagtatcggtactactttctactcctttttgggaagagagaaaatgagagttatcgtacgatatctcttcaagtactagagaatacctttacaataaaacttgtattcttgtgtgatgattcgcttcggcgatttctacgagtttgcaatgtttttagtaaatggtattctcgccttttgttgaacctaccgcaaggtttggaacgaccgaatcaatgattcgattttttgatccgatctgaaatgaagcaaggttacccgctgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_1113_18048_19103_TACCGTAG  
+aacgcacattgcgcccgctagtattctggcgggcatgcctgttcgagcgtcatttcaaccatcaagccccaggcttgcgttggagccctgcggctgccgcaggctcccaaatccagtggcgggctcgtcgtcgtaccgagtgcagtaaacatcctcgctcagggaacgcgtcgggttcttgccgtgaaaccccccctatatcaaggttgacctcggatcaggtaggaatacccgctgaacttaa
+>MISEQ-1639_50_000000000-ATW8R_1_1112_19897_18095_GTATTGGCTCTTTCCC  
+aacgcaacttgcgctgtcgagtaaccctcttcagcatgcctccttgagcatcgtttgtgcacccccttccgggaccgcacgatctcaagtgaggcaaggctacccgccgaacttaa
+>MISEQ-1639_78_000000000-B3PNL_1_2118_17053_20751_ATGCATGG  
+aacgcacattgcgcccgccagcattctggcgggcatgcctgttcgagcgtcatttcaaccctcgacttccctttggggaaatcggcgttggggaccggccgtataccgccggccccgaaatgaggtggcggcccgtccgcggcgacctctgcgtagaaatccaactcgcaccggaaccccgacgtggccacgccgtaaaacccccgccttctgaacgttgacctcggatcaggtaggaatccccgcagaacttaaccgccccttgcgcccgccagcattctgtcgggcgtgcctgttcgagcgtcatttcaaccctcgactgccctctggggaaatcggcgttggggaccggccgtataccgccggccccgaaatgaggtggcggcccgtccgcggcgacctctgcgtagaaatccaactcgcaccggaaccccgacgtggccacgccgtaaaacccccgacttctgaacgttgacctcggatcaggtaggaatacccgctgaacttaa
b
diff -r 000000000000 -r e0da3400ac2f test-data/est-2d.txt.clstr
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est-2d.txt.clstr Mon Oct 15 10:54:50 2018 -0400
b
b'@@ -0,0 +1,868 @@\n+>Cluster 0\n+0\t528nt, >MISEQ-1639_50_00000... *\n+1\t528nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 1\n+0\t503nt, >MISEQ-1639_51_00000... *\n+>Cluster 2\n+0\t441nt, >MISEQ-1639_50_00000... *\n+1\t441nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 3\n+0\t397nt, >MISEQ-1639_78_00000... *\n+1\t397nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 4\n+0\t391nt, >MISEQ-1639_78_00000... *\n+1\t391nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 5\n+0\t389nt, >MISEQ-1639_78_00000... *\n+1\t389nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 6\n+0\t388nt, >MISEQ-M4070_81_0000... *\n+1\t388nt, >MISEQ-M4070_81_0000... at +/100.00%\n+>Cluster 7\n+0\t386nt, >MISEQ-1639_50_00000... *\n+1\t386nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 8\n+0\t385nt, >MISEQ-M4070_81_0000... *\n+1\t385nt, >MISEQ-M4070_81_0000... at +/100.00%\n+>Cluster 9\n+0\t374nt, >MISEQ-1639_78_00000... *\n+1\t374nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 10\n+0\t374nt, >MISEQ-M4070_81_0000... *\n+>Cluster 11\n+0\t371nt, >MISEQ-1639_78_00000... *\n+1\t371nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 12\n+0\t369nt, >MISEQ-1639_78_00000... *\n+1\t369nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 13\n+0\t366nt, >MISEQ-M4070_81_0000... *\n+1\t366nt, >MISEQ-M4070_81_0000... at +/100.00%\n+>Cluster 14\n+0\t365nt, >MISEQ-1639_78_00000... *\n+1\t365nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 15\n+0\t365nt, >MISEQ-1639_50_00000... *\n+1\t365nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 16\n+0\t364nt, >MISEQ-1639_78_00000... *\n+1\t364nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 17\n+0\t361nt, >MISEQ-1639_78_00000... *\n+1\t361nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 18\n+0\t351nt, >MISEQ-1639_78_00000... *\n+1\t351nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 19\n+0\t348nt, >MISEQ-1639_50_00000... *\n+1\t348nt, >MISEQ-1639_50_00000... at +/100.00%\n+2\t342nt, >MISEQ-1639_50_00000... at +/96.20%\n+>Cluster 20\n+0\t347nt, >MISEQ-1639_50_00000... *\n+>Cluster 21\n+0\t345nt, >MISEQ-1639_78_00000... *\n+1\t345nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 22\n+0\t345nt, >MISEQ-1639_78_00000... *\n+1\t345nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 23\n+0\t344nt, >MISEQ-1639_50_00000... *\n+1\t344nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 24\n+0\t342nt, >MISEQ-1639_50_00000... *\n+>Cluster 25\n+0\t341nt, >MISEQ-1639_78_00000... *\n+>Cluster 26\n+0\t337nt, >MISEQ-M4070_81_0000... *\n+1\t337nt, >MISEQ-M4070_81_0000... at +/100.00%\n+2\t329nt, >MISEQ-1639_51_00000... at +/93.01%\n+>Cluster 27\n+0\t334nt, >MISEQ-1639_78_00000... *\n+1\t334nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 28\n+0\t332nt, >MISEQ-1639_51_00000... *\n+1\t332nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 29\n+0\t332nt, >MISEQ-1639_78_00000... *\n+1\t332nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 30\n+0\t329nt, >MISEQ-1639_51_00000... *\n+>Cluster 31\n+0\t329nt, >MISEQ-1639_50_00000... *\n+1\t329nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 32\n+0\t329nt, >MISEQ-1639_51_00000... *\n+1\t329nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 33\n+0\t329nt, >MISEQ-1639_78_00000... *\n+1\t329nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 34\n+0\t329nt, >MISEQ-1639_50_00000... *\n+1\t329nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 35\n+0\t329nt, >MISEQ-1639_51_00000... *\n+1\t329nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 36\n+0\t329nt, >MISEQ-1639_78_00000... *\n+>Cluster 37\n+0\t328nt, >MISEQ-1639_51_00000... *\n+1\t328nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 38\n+0\t327nt, >MISEQ-M4070_81_0000... *\n+>Cluster 39\n+0\t327nt, >MISEQ-1639_50_00000... *\n+>Cluster 40\n+0\t327nt, >MISEQ-1639_51_00000... *\n+>Cluster 41\n+0\t326nt, >MISEQ-1639_50_00000... *\n+1\t326nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 42\n+0\t325nt, >MISEQ-1639_78_00000... *\n+1\t325nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 43\n+0\t323nt, >MISEQ-1639_51_00000... *\n+1\t323nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 44\n+0\t323nt, >MISEQ-1639_51_00000... *\n+1\t323nt, >MISEQ-1639_51_00000... at +/100.00%\n+>Cluster 45\n+0\t322nt, >MISEQ-1639_78_00000... *\n'..b'Cluster 254\n+0\t165nt, >MISEQ-1639_50_00000... *\n+>Cluster 255\n+0\t160nt, >MISEQ-1639_50_00000... *\n+1\t160nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 256\n+0\t143nt, >MISEQ-1639_50_00000... *\n+1\t143nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 257\n+0\t129nt, >MISEQ-1639_50_00000... *\n+1\t129nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 258\n+0\t124nt, >MISEQ-1639_50_00000... *\n+1\t124nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 259\n+0\t121nt, >MISEQ-1639_50_00000... *\n+1\t121nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 260\n+0\t121nt, >MISEQ-1639_50_00000... *\n+>Cluster 261\n+0\t121nt, >MISEQ-1639_78_00000... *\n+1\t121nt, >MISEQ-1639_78_00000... at +/100.00%\n+>Cluster 262\n+0\t117nt, >MISEQ-1639_50_00000... *\n+1\t117nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 263\n+0\t111nt, >MISEQ-1639_50_00000... *\n+1\t111nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 264\n+0\t110nt, >MISEQ-1639_50_00000... *\n+1\t102nt, >MISEQ-1639_50_00000... at +/91.18%\n+>Cluster 265\n+0\t106nt, >MISEQ-1639_50_00000... *\n+1\t106nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 266\n+0\t106nt, >MISEQ-1639_50_00000... *\n+1\t106nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 267\n+0\t105nt, >MISEQ-1639_50_00000... *\n+1\t105nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 268\n+0\t105nt, >MISEQ-1639_50_00000... *\n+1\t105nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 269\n+0\t104nt, >MISEQ-1639_50_00000... *\n+1\t104nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 270\n+0\t104nt, >MISEQ-1639_50_00000... *\n+1\t104nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 271\n+0\t104nt, >MISEQ-1639_50_00000... *\n+1\t104nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 272\n+0\t104nt, >MISEQ-1639_50_00000... *\n+1\t104nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 273\n+0\t103nt, >MISEQ-1639_50_00000... *\n+1\t103nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 274\n+0\t103nt, >MISEQ-1639_50_00000... *\n+1\t103nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 275\n+0\t103nt, >MISEQ-1639_50_00000... *\n+1\t103nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 276\n+0\t102nt, >MISEQ-1639_50_00000... *\n+1\t102nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 277\n+0\t102nt, >MISEQ-1639_50_00000... *\n+>Cluster 278\n+0\t101nt, >MISEQ-1639_50_00000... *\n+1\t101nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 279\n+0\t101nt, >MISEQ-1639_50_00000... *\n+1\t101nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 280\n+0\t101nt, >MISEQ-1639_50_00000... *\n+1\t101nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 281\n+0\t100nt, >MISEQ-1639_50_00000... *\n+>Cluster 282\n+0\t97nt, >MISEQ-1639_50_00000... *\n+1\t97nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 283\n+0\t96nt, >MISEQ-1639_50_00000... *\n+>Cluster 284\n+0\t95nt, >MISEQ-1639_50_00000... *\n+1\t95nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 285\n+0\t92nt, >MISEQ-1639_50_00000... *\n+1\t92nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 286\n+0\t80nt, >MISEQ-1639_50_00000... *\n+1\t80nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 287\n+0\t75nt, >MISEQ-1639_50_00000... *\n+1\t75nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 288\n+0\t69nt, >MISEQ-1639_50_00000... *\n+1\t63nt, >MISEQ-1639_50_00000... at +/93.65%\n+2\t69nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 289\n+0\t66nt, >MISEQ-1639_50_00000... *\n+1\t66nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 290\n+0\t64nt, >MISEQ-1639_50_00000... *\n+1\t64nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 291\n+0\t63nt, >MISEQ-1639_50_00000... *\n+>Cluster 292\n+0\t62nt, >MISEQ-1639_50_00000... *\n+1\t62nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 293\n+0\t55nt, >MISEQ-1639_50_00000... *\n+1\t55nt, >MISEQ-1639_50_00000... at +/96.36%\n+>Cluster 294\n+0\t55nt, >MISEQ-1639_50_00000... *\n+1\t55nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 295\n+0\t55nt, >MISEQ-1639_50_00000... *\n+1\t55nt, >MISEQ-1639_50_00000... at +/100.00%\n+>Cluster 296\n+0\t55nt, >MISEQ-1639_50_00000... *\n+1\t53nt, >MISEQ-1639_50_00000... at +/90.57%\n+>Cluster 297\n+0\t55nt, >MISEQ-1639_50_00000... *\n+>Cluster 298\n+0\t53nt, >MISEQ-1639_50_00000... *\n'
b
diff -r 000000000000 -r e0da3400ac2f test-data/est_clusters_output.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_clusters_output.txt Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,39 @@
+>Cluster 0
+0 239nt, >F12Fcsw_481739... at +/99.16%
+1 243nt, >F14Fcsw_133982... *
+2 242nt, >F14Fcsw_149685... at +/99.59%
+3 230nt, >F14Fcsw_175165... at +/96.96%
+4 239nt, >F14Fcsw_176364... at +/97.91%
+5 239nt, >F14Fcsw_224425... at +/99.16%
+6 240nt, >F14Fcsw_27361... at +/99.58%
+7 239nt, >F14Fcsw_2745... at +/99.58%
+8 238nt, >F14Fcsw_37069... at +/99.58%
+9 238nt, >F14Fcsw_38031... at +/99.16%
+10 239nt, >F14Fcsw_49588... at +/99.16%
+11 230nt, >F23Fcsw_160873... at +/96.52%
+12 183nt, >F31Fcsw_135439... at +/95.63%
+13 241nt, >F34Fcsw_50866... at +/91.29%
+14 230nt, >M12Fcsw_69587... at +/92.61%
+15 240nt, >M13Fcsw_127764... at +/97.92%
+16 222nt, >M13Fcsw_198303... at +/96.40%
+17 227nt, >M14Fcsw_117325... at +/96.92%
+18 241nt, >M14Fcsw_151062... at +/99.59%
+19 239nt, >M14Fcsw_181677... at +/97.07%
+20 240nt, >M14Fcsw_186607... at +/99.17%
+21 239nt, >M24Fcsw_136217... at +/94.56%
+22 239nt, >M41Fcsw_259146... at +/97.91%
+23 210nt, >M42Fcsw_137216... at +/99.05%
+24 239nt, >M42Fcsw_138199... at +/99.16%
+25 208nt, >M42Fcsw_263016... at +/98.56%
+>Cluster 1
+0 238nt, >F22Fcsw_400293... at +/91.18%
+1 243nt, >F23Fcsw_133990... *
+2 243nt, >F23Fcsw_86009... at +/90.95%
+3 205nt, >F23Fcsw_96640... at +/91.71%
+4 210nt, >F32Fcsw_322472... at +/90.95%
+5 242nt, >F33Fcsw_137774... at +/90.91%
+6 234nt, >M13Fcsw_128004... at +/90.17%
+7 218nt, >M42Fcsw_225418... at +/90.83%
+8 193nt, >M42Fcsw_334979... at +/90.16%
+9 216nt, >M43Fcsw_250770... at +/90.28%
+10 241nt, >M44Fcsw_200453... at +/90.04%
b
diff -r 000000000000 -r e0da3400ac2f test-data/est_fasta_output.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_fasta_output.fasta Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,4 @@
+>F14Fcsw_133982
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA
+>F23Fcsw_133990
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA
b
diff -r 000000000000 -r e0da3400ac2f test-data/protein_clusters_output.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_clusters_output.txt Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,12 @@
+>Cluster 0
+0 375aa, >sp|P00325|ADH1B_HUM... *
+1 375aa, >tr|K7D361|K7D361_PA... at 99.73%
+>Cluster 1
+0 375aa, >sp|P00329|ADH1_MOUS... *
+>Cluster 2
+0 332aa, >sp|P00340|LDHA_CHIC... *
+>Cluster 3
+0 241aa, >sp|P00338-5|LDHA_HU... at 91.29%
+1 332aa, >sp|P19858|LDHA_BOVI... *
+>Cluster 4
+0 274aa, >sp|P00338-2|LDHA_HU... *
b
diff -r 000000000000 -r e0da3400ac2f test-data/protein_clusters_output_local.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_clusters_output_local.txt Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,10 @@
+>Cluster 0
+0 375aa, >sp|P00325|ADH1B_HUMAN... *
+1 375aa, >tr|K7D361|K7D361_PANTR... at 1:375:1:375/99.73%
+2 375aa, >sp|P00329|ADH1_MOUSE... at 1:375:1:375/83.47%
+>Cluster 1
+0 241aa, >sp|P00338-5|LDHA_HUMAN... at 3:239:2:239/85.71%
+1 332aa, >sp|P00340|LDHA_CHICK... *
+2 332aa, >sp|P19858|LDHA_BOVIN... at 3:332:2:332/84.59%
+>Cluster 2
+0 274aa, >sp|P00338-2|LDHA_HUMAN... *
b
diff -r 000000000000 -r e0da3400ac2f test-data/protein_fasta_output.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_fasta_output.fasta Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,36 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2
+MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT
+PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP
+RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG
+YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ
+NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF
+DLLRSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF
+>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2
+MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI
+SDVVKVTLTHEEEACLKKSADTLWGIQKELQF
b
diff -r 000000000000 -r e0da3400ac2f test-data/protein_fasta_output_local.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_fasta_output_local.fasta Mon Oct 15 10:54:50 2018 -0400
b
@@ -0,0 +1,21 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF