Repository 'cdhit'
hg clone https://toolshed.g2.bx.psu.edu/repos/bebatut/cdhit

Changeset 0:54d811ad2b52 (2016-04-25)
Next changeset 1:93f25d52cfa5 (2018-05-04)
Commit message:
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/cdhit/ commit ea0424ae045ac797c080aeabab9a8536f7eb2f84-dirty
added:
README.rst
cd_hit_est.xml
cd_hit_protein.xml
cdhit_macros.xml
test-data/cd_hit_est_in.fa
test-data/cd_hit_protein_in.fasta
test-data/est_clusters_output.txt
test-data/est_fasta_output.fasta
test-data/protein_clusters_output.txt
test-data/protein_fasta_output.fasta
tool_dependencies.xml
b
diff -r 000000000000 -r 54d811ad2b52 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,12 @@
+CD_HIT memory usage
+====================
+
+By default, a maximum of 4 GB of memory is allocated to CD_HIT.
+
+To change the maximum memory usage, edit the CDHIT_SITE_OPTIONS variable in the file:
+
+<tool_dependency_dir>/cd-hit/4.6.4/bebatut/cdhit/<hash_string>/env.sh
+
+For example, to increase the limit to 8 GB, write:
+
+CDHIT_SITE_OPTIONS='-M 8000'
\ No newline at end of file
b
diff -r 000000000000 -r 54d811ad2b52 cd_hit_est.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cd_hit_est.xml Mon Apr 25 12:15:23 2016 -0400
[
@@ -0,0 +1,92 @@
+<tool id="cd_hit_est" name="CD-HIT-EST" version="1.3">
+  <description>Cluster a nucleotide dataset into representative sequences</description>
+
+  <macros>
+    <import>cdhit_macros.xml</import>
+  </macros>
+
+  <expand macro="requirements"/>
+
+  <command><![CDATA[
+    ## Cluster the nucleotide input; -o rep_seq is the output name that the
+    ## tool outputs collect via from_work_dir (rep_seq and rep_seq.clstr).
+    cd-hit-est
+      -i "$fasta_in"
+      -o rep_seq
+      -c $similarity
+      -n $wordsize
+      $strand
+
+      ## Options shared with the protein tool (template in cdhit_macros.xml)
+      #include source=$common_cdhit_options#
+  ]]></command>
+  
+  <inputs>
+    <!-- FASTA dataset handed to cd-hit-est through -i -->
+    <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster" help="(-i)"/>
+
+    <!-- Identity threshold (-c); the form restricts it to 0.75 through 1.0 -->
+    <param name="similarity" type="float" value="0.9"  label="Similarity threshold" min=".75" max="1.0" help="(-c)"/>
+
+    <!-- Word size (-n); the help text lists the suggested value per threshold range -->
+    <param name="wordsize" type="integer" value="8" label="Word size" min="4" max="10" help="It is suggested to adjust word size in function of similarity threshold. 8,9 or 10  for threshold in [0.9;1.0] interval, 7 for [0.88;0.9], 6 for [0.85;0.88], 5 for [0.80;0.85], 4 for [0.75;0.8] (-n)"/>
+
+    <!-- Renders as "-r 1" on the command line when checked, as nothing otherwise -->
+    <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands?"/>
+
+    <!-- Advanced settings and clustering switches defined in cdhit_macros.xml -->
+    <expand macro="common_cdhit_options" />
+  </inputs>
+
+  <outputs>
+    <!-- Both datasets are collected from the job working directory; the
+         command names its output "rep_seq" (-o rep_seq). -->
+    <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: Clusters" from_work_dir="rep_seq.clstr"/>
+
+    <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: Representative sequences" from_work_dir="rep_seq"/>
+  </outputs>
+  
+  <tests>
+      <!-- Default run: 0.9 similarity, word size 8, single strand, advanced
+           settings disabled. Only parameters that exist in <inputs> are set. -->
+      <test>
+        <param name="fasta_in" value="cd_hit_est_in.fa" />
+        <param name="similarity" value="0.9"/>
+        <param name="wordsize" value="8"/>
+        <param name="strand" value="false"/>
+        <param name="settings" value="no"/>
+        <output name="clusters_out" file="est_clusters_output.txt"/>
+        <output name="fasta_out" file="est_fasta_output.fasta"/>
+      </test>
+  </tests>
+
+  <help><![CDATA[
+
+**What it does**
+
+CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
+
+.. _CD-HIT: http://www.bioinformatics.org/cd-hit/
+
+------
+
+**Inputs**
+
+cd-hit-est requires a fasta file as input. 
+
+------
+
+**Outputs**
+
+The first output is a fasta file containing representative sequences.
+
+The second output is a text file listing the mapping of sequences to the representative sequences::
+
+ >Cluster 0
+ 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
+ >Cluster 1
+ 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
+ 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
+ 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
+ 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
+ 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
+ >Cluster 2
+ 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
+ 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
+ 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
+ 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
+  ]]></help>
+
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/btl158</citation>
+    <citation type="doi">10.1093/bioinformatics/bts565</citation>
+  </citations>
+</tool>
b
diff -r 000000000000 -r 54d811ad2b52 cd_hit_protein.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cd_hit_protein.xml Mon Apr 25 12:15:23 2016 -0400
[
@@ -0,0 +1,88 @@
+<tool id="cd_hit_protein" name="CD-HIT PROTEIN" version="1.3">
+  <description>Cluster a protein dataset into representative sequences</description>
+    <macros>
+      <import>cdhit_macros.xml</import>
+    </macros>
+
+    <expand macro="requirements"/>
+
+    <command><![CDATA[
+      ## Cluster the protein input; -o rep_seq is the output name that the
+      ## tool outputs collect via from_work_dir (rep_seq and rep_seq.clstr).
+      cd-hit
+        -i "$fasta_in"
+        -o rep_seq
+        -c $similarity
+        -n $wordsize
+
+        ## Options shared with the EST tool (template in cdhit_macros.xml)
+        #include source=$common_cdhit_options#
+    ]]></command>
+
+    <inputs>
+      <!-- FASTA dataset handed to cd-hit through -i -->
+      <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster" help="(-i)"/>
+
+      <!-- Identity threshold (-c); the form restricts it to 0.4 through 1.0 -->
+      <param name="similarity" type="float" value="0.9"  label="Similarity threshold" min=".4" max="1.0" help="Similarity threshold must be in [0.4;1.0] interval (-c)"/>
+
+      <!-- Word size (-n); the help text lists the suggested value per threshold range -->
+      <param name="wordsize" type="integer" value="5"  label="Word size" min="2" max="5" help="It is suggested to adjust word size in function of similarity threshold. 5 for threshold in [0.7;1.0] interval, 4 for [0.6;0.7], 3 for [0.5;0.6], 2 for [0.4;0.5] (-n)"/>
+
+      <!-- Advanced settings and clustering switches defined in cdhit_macros.xml -->
+      <expand macro="common_cdhit_options" />
+    </inputs>
+
+    <outputs>
+      <!-- Both datasets are collected from the job working directory; the
+           command names its output "rep_seq" (-o rep_seq). -->
+      <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: Clusters" from_work_dir="rep_seq.clstr"/>
+      <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: Representative sequences" from_work_dir="rep_seq"/>
+    </outputs>
+
+    <tests>
+      <!-- Default run: 0.9 similarity, word size 5, advanced settings disabled,
+           no alignment overlap printing, fast clustering. Only parameters
+           that exist in <inputs> are set. -->
+      <test>
+        <param name="fasta_in" value="cd_hit_protein_in.fasta" />
+        <param name="similarity" value="0.9" />
+        <param name="wordsize" value="5" />
+        <param name="settings" value="no"/>
+        <param name="print_alignment" value="false"/>
+        <param name="cluster_type" value="false"/>
+        <output name="clusters_out" file="protein_clusters_output.txt"/>
+        <output name="fasta_out" file="protein_fasta_output.fasta"/>
+      </test>
+    </tests>
+
+    <help><![CDATA[
+**What it does**
+
+CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
+
+.. _CD-HIT: http://www.bioinformatics.org/cd-hit/
+
+------
+
+**Inputs**
+
+cd-hit requires a protein fasta file as input.
+
+------
+
+**Outputs**
+
+The first output is a fasta file containing representative sequences.
+
+The second output is a text file listing the mapping of sequences to the representative sequences::
+
+  >Cluster 0
+  0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
+  >Cluster 1
+  0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
+  1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
+  2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
+  3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
+  4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
+  >Cluster 2
+  0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
+  1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
+  2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
+  3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
+    ]]></help>
+
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btl158</citation>
+        <citation type="doi">10.1093/bioinformatics/bts565</citation>
+    </citations>
+</tool>
b
diff -r 000000000000 -r 54d811ad2b52 cdhit_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cdhit_macros.xml Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,116 @@
+<macros>
+  <!-- Package requirement expanded by both CD-HIT tool wrappers -->
+  <xml name="requirements">
+    <requirements>
+      <requirement type="package" version="4.6.4">cd-hit</requirement>
+    </requirements>
+  </xml>
+
+  <!-- Cheetah fragment appended to the cd-hit / cd-hit-est command line.
+       Each advanced option is emitted only when the user supplied a value.
+       An optional parameter left blank may render as '' or as 'None'
+       depending on the Galaxy release, so both are treated as "not set".
+       Every reference goes through the "advanced" conditional and, for the
+       coverage options, its nested "align" conditional. -->
+  <template name="common_cdhit_options">
+    #if $advanced.settings == 'yes':
+
+      #if str($advanced.band_width) not in ('', 'None'):
+        -b $advanced.band_width
+      #end if
+
+      #if str($advanced.throw_away_len) not in ('', 'None'):
+        -l $advanced.throw_away_len
+      #end if
+
+      #if str($advanced.description_len) not in ('', 'None'):
+        -d $advanced.description_len
+      #end if
+
+      #if str($advanced.cutoff_diff_len) not in ('', 'None'):
+        -s $advanced.cutoff_diff_len
+      #end if
+
+      #if str($advanced.aa_cutoff_diff_len) not in ('', 'None'):
+        -S $advanced.aa_cutoff_diff_len
+      #end if
+
+      #if $advanced.align.style == 'local':
+        ## -G 0 selects local sequence identity; coverage limits follow
+        -G 0
+        #if str($advanced.align.align_coverage_long) not in ('', 'None'):
+          -aL $advanced.align.align_coverage_long
+        #end if
+
+        #if str($advanced.align.align_coverage_long_control) not in ('', 'None'):
+          -AL $advanced.align.align_coverage_long_control
+        #end if
+
+        #if str($advanced.align.align_coverage_short) not in ('', 'None'):
+          -aS $advanced.align.align_coverage_short
+        #end if
+
+        #if str($advanced.align.align_coverage_short_control) not in ('', 'None'):
+          -AS $advanced.align.align_coverage_short_control
+        #end if
+
+        #if str($advanced.align.align_coverage_min) not in ('', 'None'):
+          -A $advanced.align.align_coverage_min
+        #end if
+      #end if
+    #end if
+
+    #if $print_alignment:
+      $print_alignment
+    #end if
+
+    #if $cluster_type:
+      $cluster_type
+    #end if
+
+    ## Site-wide options (e.g. the memory limit) come from the dependency's env.sh
+    \$CDHIT_SITE_OPTIONS
+
+    ## Use the slots Galaxy allotted to the job, falling back to one thread
+    -T \${GALAXY_SLOTS:-1}
+  </template>
+
+  <!-- Input parameters shared by the CD-HIT tool forms: an "advanced"
+       conditional (with a nested "align" conditional for local alignment
+       coverage limits) plus two boolean switches. The template of the same
+       name turns these values into command-line flags. -->
+  <macro name="common_cdhit_options">
+    <conditional name="advanced">
+      <param name="settings" type="select" label="Use advanced settings?">
+        <option value="no" selected="true">No</option>
+        <option value="yes">Yes</option>
+      </param>
+
+      <when value="no"/>
+
+      <when value="yes">
+        <param name="band_width" type="integer" value="20" optional="true" label="Alignment band width" min="1" help="(-b)"/>
+
+        <param name="throw_away_len" type="integer" value="10" optional="true" label="Length of throw away sequences" min="1" help="(-l)"/>
+
+        <param name="description_len" type="integer" value="20" optional="true" label="Length of the description" min="0" help="If set to 0, it takes the fasta defline and stops at first space (-d)"/>
+
+        <param name="cutoff_diff_len" type="float" value="0.0" optional="true" label="Length difference cutoff" min="0.0" max="1.0" help="If set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster (-s)"/>
+
+        <param name="aa_cutoff_diff_len" type="integer" value="999999" optional="true" label="Length difference cutoff in amino acid" min="0" help="If set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60 (-S)"/>
+
+        <!-- Coverage parameters are only offered for local alignments -->
+        <conditional name="align">
+          <param name="style" type="select" label="global or local alignments" help="Local sequence identity, calculated as : number of identical amino acids in alignment divided by the length of the alignment. You must set alignment coverage by length or fraction.">
+            <option value="global" selected="true">Global</option>
+            <option value="local" >Local</option>
+          </param>
+
+          <when value="global"/>
+
+          <when value="local">
+              <param name="align_coverage_long" type="float" value="0.0" optional="true" label="Alignment coverage for the longer sequence" min="0.0" max="1.0" help="If set to 0.9, the alignment must covers 90% of the sequence (-aL)"/>
+
+              <param name="align_coverage_long_control" type="integer" value="99999999" optional="true" label="Alignment coverage control for the longer sequence " min="0" help="If set to 60, and the sequence's length 400,then the alignment must be at least 340 (400-60) residues (-AL)"/>
+     
+              <param name="align_coverage_short" type="float" value="0.0" optional="true" label="Alignment coverage for the shorter sequence" min="0.0" max="1.0" help="As for the longer (-aS)"/>
+     
+              <param name="align_coverage_short_control" type="integer" value="99999999" optional="true" label="Alignment coverage control for the shorter sequence" min="0" help="As for the longer (-AS)"/>
+     
+              <param name="align_coverage_min" type="integer" value="0" optional="true" label="Minimal alignment coverage control for the both sequences" min="0" help="Alignment must cover at least this value for both sequences (-A)"/>
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+
+    <!-- Boolean switches: each renders as its flag when checked, as nothing otherwise -->
+    <param name="print_alignment" type="boolean" truevalue="-p 1" falsevalue="" checked="false" label="Print alignment overlap in .clstr file"/>
+
+    <param name="cluster_type" type="boolean" truevalue="-g 1" falsevalue="" checked="false" label="Slow Cluster" help="In cd-hit's default algorithm, a sequence is clustered to the first cluster that meet the threshold (fast cluster). If set to slow, the program will cluster it into the most similar cluster that meet the threshold (accurate but slow mode). This won't change the representatives of final clusters (-g)"/>
+  </macro>
+
+</macros>
b
diff -r 000000000000 -r 54d811ad2b52 test-data/cd_hit_est_in.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cd_hit_est_in.fa Mon Apr 25 12:15:23 2016 -0400
b
b'@@ -0,0 +1,74 @@\n+>F12Fcsw_481739\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGACGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_133982\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA\n+>F14Fcsw_149685\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_175165\n+CGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_176364\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACGCGGTCCAA\n+>F14Fcsw_224425\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTAAGACACGGTCCAA\n+>F14Fcsw_27361\n+CGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_2745\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCC
ATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F14Fcsw_37069\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAG\n+>F14Fcsw_38031\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCAA\n+>F14Fcsw_49588\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACGCGGTCCAA\n+>F22Fcsw_400293\n+CCGGCGCACGGGTGAGTAACACGTATCCAACCTTCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCAACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_133990\n+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_160873\n+CGGGTGAGTAACGCGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTAGTAGGCGGGGTAACGGCCCACCTAGCCAACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_86009\n+GGCGACCGGCGCACGGGTGAGTAACGCGTATCCAACCTTCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCAACGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>F23Fcsw_96640\n+TCCGTACACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACT
ATTAAAGAATTTCGGTCATCGATGGGGATGCGT'..b'GGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M13Fcsw_127764\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGTACTGAGACACGGACCAA\n+>M13Fcsw_128004\n+CGGACGGGTGAGTAACGCGTGAGTAACCTGCCGATAACTCAGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGAATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGGCGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGTACTGAGACACGGACCAA\n+>M13Fcsw_198303\n+TAACACGTATCCAACCTGCCTCATACTCGGGGATAACCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCAACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_117325\n+GTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_151062\n+CGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_181677\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M14Fcsw_186607\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGGTTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M24Fcsw_136217\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGT
TTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTCTGATTAGCTTGTTGGCGGGGTAACGGCCCACCAAGGCACCGATCAGTAGGGGTTCTGAGAGGAAGGTCCCCCACATAGGAACTGAGACACGGTCCTA\n+>M41Fcsw_259146\n+ACCGGCGCACGGGTGAGTAACACGTATCCAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_137216\n+CAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_138199\n+GACCGGCGCACGGGTGAGTAACACGTATCCAACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_225418\n+CACGTATCCAACCTGCCGTCTACTCTTGGACAGCCTTCTGAAAGGAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCACATTGGAACTGAGACACGGTCCAA\n+>M42Fcsw_263016\n+ACCTACCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCCA\n+>M42Fcsw_334979\n+GGGATAGCCGCCCGAAAGGACGGGTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCAAGCCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M43Fcsw_250770\n+GTATCCAACCTGCCGTCTACTCTTGGACAGCCTTCTGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGGATTTCGGTCATCGATGGGGATGCGTTCCATTAGGTTGTTGGCGGGGTAACGGCCCACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n+>M44Fcsw_200453\n+CTAGTGGCGGACGGGTGAGTAACGCGTATCCAACCTGCCGATGACTCGGGGATAGCCTTTCGAAAGAAAGATTAATACCCGATGGCATAGTTCTTCCGCATGGTAGAACTATTAAAGAACTTCGGTCATCGATGGGGATGCGTTCCATTAGATAGTAGGCGGGGTAACGGCCC
ACCTAGTCTTCGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA\n'
b
diff -r 000000000000 -r 54d811ad2b52 test-data/cd_hit_protein_in.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cd_hit_protein_in.fasta Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,50 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>tr|K7D361|K7D361_PANTR Alcohol dehydrogenase 1B (Class I), beta polypeptide OS=Pan troglodytes GN=ADH1B PE=2 SV=1
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPAILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2
+MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT
+PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP
+RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG
+YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ
+NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF
+DLLRSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00338-5|LDHA_HUMAN Isoform 5 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVERVFT
+E
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF
+>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2
+MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI
+SDVVKVTLTHEEEACLKKSADTLWGIQKELQF
b
diff -r 000000000000 -r 54d811ad2b52 test-data/est_clusters_output.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_clusters_output.txt Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,39 @@
+>Cluster 0
+0 239nt, >F12Fcsw_481739... at +/99.16%
+1 243nt, >F14Fcsw_133982... *
+2 242nt, >F14Fcsw_149685... at +/99.59%
+3 230nt, >F14Fcsw_175165... at +/96.96%
+4 239nt, >F14Fcsw_176364... at +/97.91%
+5 239nt, >F14Fcsw_224425... at +/99.16%
+6 240nt, >F14Fcsw_27361... at +/99.58%
+7 239nt, >F14Fcsw_2745... at +/99.58%
+8 238nt, >F14Fcsw_37069... at +/99.58%
+9 238nt, >F14Fcsw_38031... at +/99.16%
+10 239nt, >F14Fcsw_49588... at +/99.16%
+11 230nt, >F23Fcsw_160873... at +/96.52%
+12 183nt, >F31Fcsw_135439... at +/95.63%
+13 241nt, >F34Fcsw_50866... at +/91.29%
+14 230nt, >M12Fcsw_69587... at +/92.61%
+15 240nt, >M13Fcsw_127764... at +/97.92%
+16 222nt, >M13Fcsw_198303... at +/96.40%
+17 227nt, >M14Fcsw_117325... at +/96.92%
+18 241nt, >M14Fcsw_151062... at +/99.59%
+19 239nt, >M14Fcsw_181677... at +/97.07%
+20 240nt, >M14Fcsw_186607... at +/99.17%
+21 239nt, >M24Fcsw_136217... at +/94.56%
+22 239nt, >M41Fcsw_259146... at +/97.91%
+23 210nt, >M42Fcsw_137216... at +/99.05%
+24 239nt, >M42Fcsw_138199... at +/99.16%
+25 208nt, >M42Fcsw_263016... at +/98.56%
+>Cluster 1
+0 238nt, >F22Fcsw_400293... at +/91.18%
+1 243nt, >F23Fcsw_133990... *
+2 243nt, >F23Fcsw_86009... at +/90.95%
+3 205nt, >F23Fcsw_96640... at +/91.71%
+4 210nt, >F32Fcsw_322472... at +/90.95%
+5 242nt, >F33Fcsw_137774... at +/90.91%
+6 234nt, >M13Fcsw_128004... at +/90.17%
+7 218nt, >M42Fcsw_225418... at +/90.83%
+8 193nt, >M42Fcsw_334979... at +/90.16%
+9 216nt, >M43Fcsw_250770... at +/90.28%
+10 241nt, >M44Fcsw_200453... at +/90.04%
b
diff -r 000000000000 -r 54d811ad2b52 test-data/est_fasta_output.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_fasta_output.fasta Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,4 @@
+>F14Fcsw_133982
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA
+>F23Fcsw_133990
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA
b
diff -r 000000000000 -r 54d811ad2b52 test-data/protein_clusters_output.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_clusters_output.txt Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,12 @@
+>Cluster 0
+0 375aa, >sp|P00325|ADH1B_HUM... *
+1 375aa, >tr|K7D361|K7D361_PA... at 99.73%
+>Cluster 1
+0 375aa, >sp|P00329|ADH1_MOUS... *
+>Cluster 2
+0 332aa, >sp|P00340|LDHA_CHIC... *
+>Cluster 3
+0 241aa, >sp|P00338-5|LDHA_HU... at 91.29%
+1 332aa, >sp|P19858|LDHA_BOVI... *
+>Cluster 4
+0 274aa, >sp|P00338-2|LDHA_HU... *
b
diff -r 000000000000 -r 54d811ad2b52 test-data/protein_fasta_output.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_fasta_output.fasta Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,36 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2
+MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT
+PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP
+RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG
+YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ
+NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF
+DLLRSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF
+>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2
+MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI
+SDVVKVTLTHEEEACLKKSADTLWGIQKELQF
b
diff -r 000000000000 -r 54d811ad2b52 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon Apr 25 12:15:23 2016 -0400
b
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <!-- Builds cd-hit 4.6.4 from the upstream release tarball and installs the
+         cd-hit and cd-hit-est binaries into $INSTALL_DIR. -->
+    <package name="cd-hit" version="4.6.4">
+        <install version="1.0">
+            <actions>
+                <action type="download_by_url" md5="d4baba1e95ba9c831a763c6b8a19ce21">https://github.com/weizhongli/cdhit/releases/download/V4.6.4/cd-hit-v4.6.4-2015-0603.tar.gz</action>
+                <action type="shell_command">make</action>
+                <action type="move_file">
+                     <source>cd-hit</source>
+                     <destination>$INSTALL_DIR</destination>
+                </action>
+                <action type="move_file">
+                      <source>cd-hit-est</source>
+                      <destination>$INSTALL_DIR</destination>
+                </action>
+                <!-- Expose the binaries on PATH and set the default site options;
+                     the tool command line appends $CDHIT_SITE_OPTIONS, and -M is
+                     the maximum memory in Mb (see the readme below). -->
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
+                    <environment_variable name="CDHIT_SITE_OPTIONS" action="set_to">"-M 4000"</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+These links provide information for building the cdhit package:
+
+http://weizhong-lab.ucsd.edu/cd-hit/
+
+https://code.google.com/p/cdhit/
+
+https://code.google.com/p/cdhit/source/browse/README
+
+Change the CDHIT_SITE_OPTIONS variable in the installed env.sh file to adjust 
+the maximum memory Mb (-M).
+        </readme>
+    </package>
+</tool_dependency>
\ No newline at end of file