Repository 'khmer'
hg clone https://toolshed.g2.bx.psu.edu/repos/crusoe/khmer

Changeset 15:2c4635f5be47 (2015-09-18)
Previous changeset 14:3d90a3a78c3b (2015-09-12)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/khmer/ commit b9c1b77ffc1fc6341040bc00bee42459b1a4f66d-dirty
modified:
abundance-dist-single.xml
abundance-dist.xml
do-partition.xml
extract-partitions.xml
filter-abund.xml
filter-below-abund.xml
macros.xml
normalize-by-median.xml
repository_dependencies.xml
test-data/random-20-a.fa.part
added:
test-data/normalize-by-median.c2.report.txt
test-data/normalize-by-median.paired.report.txt
test-data/normalize-by-median.report.txt
test-data/random-20-a.fa.part.info
test-data/random-20-a.part.extract.fa
removed:
normalize-by-median.cwl
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 abundance-dist-single.xml
--- a/abundance-dist-single.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/abundance-dist-single.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-abundance-dist-single"
  name="Abundance Distribution (all-in-one)"
- version="2.0-4">
+ version="2.0-5">
 
  <description>
  Calculate abundance distribution of the k-mers in a given
@@ -14,20 +14,18 @@
  <expand macro="stdio" />
  <expand macro="version" />
  <command><![CDATA[
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-mkdir output; cd output;
+mkdir output && cd output &&
 @BINARY@
 @TABLEPARAMS@
-$zero
-$bigcount
+${zero}
+${bigcount}
 #if $save_countgraph
---savegraph=$optional_output_countgraph
+--savegraph=${optional_output_countgraph}
 #end if
 --squash
 @THREADS@
-$input_sequence_filename
-$output_histogram_filename
+${input_sequence_filename}
+${output_histogram_filename}
 ]]>
  </command>
 
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 abundance-dist.xml
--- a/abundance-dist.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/abundance-dist.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-abundance-dist"
  name="Abundance Distribution"
- version="2.0-3">
+ version="2.0-4">
 
  <description>
  Calculate abundance distribution of the k-mers in a given sequence
@@ -14,16 +14,14 @@
  <expand macro="stdio" />
  <expand macro="version" />
  <command><![CDATA[
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-mkdir output; cd output;
+mkdir output && cd output &&
 @BINARY@
 --squash
-$zero
-$bigcount
-$input_countgraph_filename
-$input_sequence_filename
-$output_histogram_filename
+${zero}
+${bigcount}
+${input_countgraph_filename}
+${input_sequence_filename}
+${output_histogram_filename}
 ]]>
  </command>
 
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 do-partition.xml
--- a/do-partition.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/do-partition.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,65 +1,51 @@
 <tool id="gedlab-khmer-do-partition"
  name="Sequence partition all-in-one"
- version="2.0-1">
+ version="2.0-3">
 
  <description>
  Load, partition, and annotate FAST[AQ] sequences
  </description>
         <macros>
-                <token name="@BINARY@">do-parition.py</token>
+                <token name="@BINARY@">do-partition.py</token>
                 <import>macros.xml</import>
         </macros>
  <expand macro="requirements" />
  <expand macro="stdio" />
  <expand macro="version" />
  <command><![CDATA[
-mkdir -p output;
+set -xu &&
+#for $num, $input in enumerate($inputs)
+ln -s ${input} sequence-${num} &&
+#end for
+mkdir -p output && cd output &&
 @BINARY@
 @TABLEPARAMS@
 @THREADS@
 output
-#for input in $inputs
-$input
-#end for ;
-mv output.info $infomation ;
-mv *.part output/
+../sequence-*
+&&
+mv output.info $information ;
 ]]>
  </command>
 
  <inputs>
  <expand macro="input_sequences_filenames" />
- <param name="ksize"
- type="integer"
- value="20"
- label="ksize"
- help="k-mer size to use (--ksize/-k)" />
- <param name="n_tables"
- type="integer"
- min="1"
- value="4"
- label="n_tables"
- help="number of tables to use (--n_tables/-N)" />
- <param name="tablesize_specific"
- type="text"
- label="tablesize"
- help="lower bound on the tablesize to use (--min-tablesize/-x)" />
+ <expand macro="tableinputs" />
  </inputs>
  <outputs>
  <data name="information"
  format="txt"
  label="${tool.name} summary for #echo ','.join(map(str, $inputs ))#" />
- <expand macro="output_sequences" />
+ <collection name="annotated_sequence_files" type="list">
+ <discover_datasets pattern="__name__" directory="output" />
+ </collection>
  </outputs>
  <tests>
  <test>
                         <param name="inputs" value="random-20-a.fa"/>
- <output name="output">
- <discovered_dataset designation="random-20-a.fa.part">
- <assert_contents>
- <has_text text='>35     2' />
- </assert_contents>
- </discovered_dataset>
-                        </output>
+ <output_collection name="annotated_sequence_files" type="list">
+ <element name="sequence-0.part" file="random-20-a.fa.part" />
+                        </output_collection>
                 </test>
  </tests>
  <help><![CDATA[
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 extract-partitions.xml
--- a/extract-partitions.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/extract-partitions.xml Fri Sep 18 17:48:50 2015 -0400
b
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-extract-partitions"
  name="Extract partitions"
- version="2.0-1">
+ version="2.0-2">
 
  <description>
  Separate sequences that are annotated with partitions into
@@ -52,18 +52,17 @@
  <data name="distribution"
  format="txt"
  label="Partition size distribution from ${tool.name}" />
- <expand macro="output_sequences" />
+ <collection name="groups-of-partitions" type="list">
+ <discover_datasets pattern="__name_and_ext__" directory="output" />
+ </collection>
  </outputs>
 
  <tests>
  <test>
  <param name="inputs" value="random-20-a.fa.part"/>
- <output name="distribution">
- <assert_contents>
- <has_line_matching
- expression="90 1 3 98" />
- </assert_contents>
- </output>
+ <output_collection name="groups-of-partitions">
+ <element name="output.group0000" file="random-20-a.part.extract.fa" />
+ </output_collection>
  </test>
 
  </tests>
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 filter-abund.xml
--- a/filter-abund.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/filter-abund.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-filter-abund"
  name="Filter by abundance"
- version="2.0-4">
+ version="2.0-5">
 
  <description>
  Trims fastq/fasta sequences at k-mers of a given abundance
@@ -14,17 +14,17 @@
  <expand macro="stdio" />
  <expand macro="version" />
  <command><![CDATA[
-mkdir output; cd output;
+set -xu &&
+#for $num, $input in enumerate($inputs)
+ln -s ${input} filter-abund-sequence-${num} &&
+#end for
+mkdir output && cd output &&
 @BINARY@
-#if $cutoff != 2
-  --cutoff=$cutoff
-#end if
-$variable_coverage
+--cutoff=${cutoff}
+${variable_coverage}
 @THREADS@
-$input_countgraph_filename
-#for input in $inputs
- $input
-#end for
+${input_countgraph_filename}
+../filter-abund-sequence*
 ]]>
  </command>
 
@@ -45,34 +45,36 @@
  <expand macro="input_countgraph_filename" />
  </inputs>
  <outputs>
- <expand macro="output_sequences" />
+ <collection name="filter-abund-sequences" type="list">
+ <discover_datasets pattern="__name__" directory="output" />
+ </collection>
  </outputs>
  <tests>
                 <test>
                         <param name="inputs" value="test-abund-read-2.fa" />
  <param name="input_countgraph_filename"
  value="test-abund-read-2.oxlicg" ftype="oxlicg" />
-                        <output name="output">
- <discover_dataset name="test-abund-read-2.fa.abundfilt"> 
+ <output_collection name="filter-abund-sequences" type="list">
+ <element name="filter-abund-sequence-0.abundfilt"> 
                                  <assert_contents>
                                          <has_text text="GGTTGACGGGGCTCAGGG" />
                                  </assert_contents>
- </discover_dataset>
-                        </output>
+ </element>
+                        </output_collection>
                 </test>
-                <test>
+ <test>
  <param name="inputs" value="test-abund-read-2.fa" />
  <param name="input_countgraph_filename"
  value="test-abund-read-2.oxlicg" ftype="oxlicg" />
  <param name="cutoff" value="1" />
-                        <output name="output">
- <discover_dataset name="test-abund-read-2.fa.abundfilt">
+                        <output_collection name="filter-abund-sequences" type="list">
+ <element name="filter-abund-sequence-0.abundfilt">
                                  <assert_contents>
                                          <has_text text="GGTTGACGGGGCTCAGGG" />
                                  </assert_contents>
- </discover_dataset>
-                        </output>
-                </test>
+ </element>
+                        </output_collection>
+ </test>
         </tests>
  <help><![CDATA[
 Trim sequences at a minimum k-mer abundance.
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 filter-below-abund.xml
--- a/filter-below-abund.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/filter-below-abund.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-filter-below-abund"
  name="Filter below abundance cutoff of 50"
- version="2.0-1">
+ version="2.0-2">
 
  <description>
  Trims fastq/fasta sequences at k-mers with abundance below 50
@@ -13,13 +13,17 @@
  <expand macro="requirements" />
  <expand macro="stdio" />
  <expand macro="version" />
- <command>
-mkdir output; cd output;
+ <command><![CDATA[
+set -xu &&
+## Link every input to a predictable name so the ../sequence* glob below matches them all.
+#for $num, $input in enumerate($inputs)
+ln -s ${input} sequence-${num} &&
+#end for
+mkdir output && cd output &&
 @BINARY@
-$input_countgraph_filename
-#for input in $inputs
- $input
-#end for
+${input_countgraph_filename}
+../sequence*
+]]>
  </command>
 
  <inputs>
@@ -27,8 +31,9 @@
  <expand macro="input_countgraph_filename" />
  </inputs>
  <outputs>
- <!-- <expand macro="output_sequences" /> -->
- <expand macro="output_sequences_single" />
+ <collection name="sequence_files" type="list">
+ <discover_datasets pattern="__name__" directory="output" />
+ </collection>
  </outputs>
  <!--        <tests>
                 <test>
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 macros.xml
--- a/macros.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/macros.xml Fri Sep 18 17:48:50 2015 -0400
b
@@ -67,7 +67,8 @@
  <param name="tablesize_specific"
  type="text"
  label="tablesize"
- help="lower bound on the tablesize to use" />
+ help="(--max-tablesize) upper bound on the tablesize to use"
+         value="1000000.0" />
  </when>
  </conditional>
  </xml>
@@ -105,7 +106,7 @@
                 <data   name="output"
                         format_source="inputs"
                         label="${tool.name} processed nucleotide sequence file">
-                        <discover_datasets pattern="__name__" directory="output" visible="true"/>
+                        <discover_datasets pattern="__designation_and_ext__" directory="output" visible="true"/>
                 </data>
  </xml>
  <xml name="output_sequences_single">
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 normalize-by-median.cwl
--- a/normalize-by-median.cwl Sat Sep 12 21:05:57 2015 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,72 +0,0 @@
-#!/usr/bin/env cwl-runner
-- arguments:
-  - valueFrom: {engine: '#galaxy_command_line', script: " \nmkdir output;\ncd output;\n\
-        normalize-by-median.py\n$paired_switch\n@TABLEPARAMS@\n--cutoff=$cutoff\n\
-        #if $save_countingtable\n--savetable=$countingtable\n#end if\n#if $countingtable_to_load\n\
-        --loadtable=$countingtable_to_load\n#end if\n--report-total-kmers\n#for entry\
-        \ in $many_inputs\n#for input in $entry.inputs\n$input\n#end for\n#end for\n\
-        --out=$output\n"}
-  baseCommand: [/bin/sh, -c]
-  class: CommandLineTool
-  id: '#gedlab-khmer-normalize-by-median'
-  inputs:
-  - id: '#many_inputs'
-    type:
-      items:
-        fields:
-        - {label: 'FAST[AQ] file(s)', name: inputs, type: File}
-        name: many_inputs
-        type: record
-      type: array
-  - default: ''
-    id: '#paired_switch'
-    label: Are the inputs interleaved paired ends?
-    type:
-      name: paired_switch
-      symbols: ['', --paired]
-      type: enum
-  - {id: '#countingtable_to_load', label: an optional k-mer counting table to load,
-    type: File}
-  - {default: 'false', id: '#save_countingtable', label: Save the k-mer counting table(s)
-      in a file, type: boolean}
-  - {default: 20, id: '#cutoff', label: cutoff, type: int}
-  - id: '#parameters'
-    type:
-    - fields:
-      - label: Sample Type
-        name: tablesize
-        type:
-          name: tablesize
-          symbols: [1e9, 2e9, 4e9, 16e9]
-          type: enum
-      - name: type
-        type:
-          name: simple2
-          symbols: [simple]
-          type: enum
-      name: simple
-      type: record
-    - fields:
-      - {default: 20, label: ksize, name: ksize, type: int}
-      - {default: 4, label: n_tables, name: n_tables, type: int}
-      - {label: tablesize, name: tablesize_specific, type: string}
-      - name: type
-        type:
-          name: specific2
-          symbols: [specific]
-          type: enum
-      name: specific
-      type: record
-  - {default: countingtable, id: '#countingtable', type: string}
-  label: Normalize By Median
-  outputs:
-  - id: '#countingtable2'
-    outputBinding: {glob: countingtable}
-    type: File
-  requirements:
-  - {class: ExpressionEngineRequirement, engineCommand: ./galaxy-command-line.py,
-    id: '#galaxy_command_line'}
-  - {class: ExpressionEngineRequirement, engineCommand: ./galaxy-template.py, id: '#galaxy_template'}
-  - class: EnvVarRequirement
-    envDef:
-    - {envName: GALAXY_SLOTS, envValue: ''}
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 normalize-by-median.xml
--- a/normalize-by-median.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/normalize-by-median.xml Fri Sep 18 17:48:50 2015 -0400
[
@@ -1,6 +1,6 @@
 <tool id="gedlab-khmer-normalize-by-median"
  name="Normalize By Median"
- version="2.0-3">
+ version="2.0-4">
 
  <description>
  Filters a fastq/fasta file using digital normalization via
@@ -14,35 +14,32 @@
  <expand macro="stdio" />
  <expand macro="version" />
  <command><![CDATA[ 
-mkdir output;
-cd output;
+set -xu &&
+#for $num, $input in enumerate($inputs)
+ln -s ${input} sequence-${num} &&
+#end for
+mkdir output &&
+cd output &&
 normalize-by-median.py
-$paired_switch
-$force_single_switch
+${paired_switch}
+${force_single_switch}
 @TABLEPARAMS@
---cutoff=$cutoff
+--cutoff=${cutoff}
 #if $unpaired_reads_filename
---unpaired-reads=$unpaired_reads_filename
+--unpaired-reads=${unpaired_reads_filename}
 #end if
 #if $save_countgraph
---savegraph=$countgraph
+--savegraph=${countgraph}
 #end if
 #if $countgraph_to_load
---loadgraph=$countgraph_to_load
+--loadgraph=${countgraph_to_load}
 #end if
---report-total-kmers
-#for entry in $many_inputs
-#for input in $entry.inputs
-$input
-#end for
-#end for
---out=$output
+--report=${report}
+../sequence-*
 ]]>
  </command>
  <inputs>
- <repeat name="many_inputs" title="input(s) set" min="1" default="1">
- <expand macro="input_sequences_filenames" />
- </repeat>
+ <expand macro="input_sequences_filenames" />
  <param name="paired_switch"
  type="boolean"
  checked="false"
@@ -88,53 +85,60 @@
  label="${tool.name} k-mer countgraph">
  <filter>save_countgraph == True</filter>
  </data>
- <!-- <expand macro="output_sequences" /> -->
- <expand macro="output_sequences_single" />
+ <data   name="report"
+ format="txt"
+ label="${tool.name} report">
+ </data>
+ <collection name="sequences" type="list">
+ <discover_datasets pattern="__name__" directory="output" />
+ </collection>
  </outputs>
  <tests>
  <test>
- <conditional name="parameters">
- <param name="type" value="specific" />
- <param name="inputs" value="test-abund-read-2.fa"/>
- <param name="cutoff" value="1" />
- <param name="ksize" value="17" />
- </conditional>
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
+ <param name="inputs" value="test-abund-read-2.fa"/>
+ <param name="type" value="specific" />
+ <param name="cutoff" value="1" />
+ <param name="ksize" value="17" />
+ <output name="report" file="normalize-by-median.report.txt" />
+ <output_collection name="sequences" type="list">
+ <element name="sequence-0.keep">
  <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
+ <has_text text="GGTTGACGGGGCTCAGGGGG" />
  </assert_contents>
- </discover_dataset>
- </output>
+ </element>
+ </output_collection>
  </test>
  <test>
  <param name="inputs" value="test-abund-read-2.fa" />
+ <param name="type" value="specific" />
  <param name="cutoff" value="2" />
  <param name="ksize" value="17" />
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
+ <output name="report" file="normalize-by-median.c2.report.txt" />
+ <output_collection name="sequences" type="list">
+ <element name="sequence-0.keep">
  <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+ <has_text text="GGTTGACGGGGCTCAGGGGG" />
+ <has_text text="GGTTGACGGGGCTCAGGG" />
  </assert_contents>
- </discover_dataset>
- </output>
+ </element>
+ </output_collection>
  </test>
  <test>
  <param name="inputs" value="test-abund-read-paired.fa" />
+ <param name="type" value="specific" />
  <param name="cutoff" value="1" />
  <param name="ksize" value="17" />
  <param name="paired" value="true" />
- <output name="output">
- <discover_dataset name="test-abund-read-paired.fa.keep">
+ <output name="report" file="normalize-by-median.paired.report.txt" />
+ <output_collection name="sequences" type="list">
+ <element name="sequence-0.keep">
  <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
+ <has_text text="GGTTGACGGGGCTCAGGGGG" />
+ <has_text text="GGTTGACGGGGCTCAGGG" />
  </assert_contents>
- </discover_dataset>
- </output>
+ </element>
+ </output_collection>
  </test>
-
  </tests>
  <help><![CDATA[
 Do digital normalization (remove mostly redundant sequences)
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 repository_dependencies.xml
--- a/repository_dependencies.xml Sat Sep 12 21:05:57 2015 -0400
+++ b/repository_dependencies.xml Fri Sep 18 17:48:50 2015 -0400
b
@@ -1,5 +1,4 @@
 <?xml version="1.0"?>
 <repositories description="We require the khmer package and the oxli datatype definitions.">
- <repository changeset_revision="3641a7d3b7c4" name="oxli_datatypes" owner="crusoe" toolshed="https://toolshed.g2.bx.psu.edu" />
  <repository changeset_revision="c7702a7d26e8" name="package_khmer_2_0" owner="crusoe" toolshed="https://toolshed.g2.bx.psu.edu" />
 </repositories>
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/normalize-by-median.c2.report.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalize-by-median.c2.report.txt Fri Sep 18 17:48:50 2015 -0400
b
@@ -0,0 +1,2 @@
+total,kept,f_kept
+1001,2,0.001998
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/normalize-by-median.paired.report.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalize-by-median.paired.report.txt Fri Sep 18 17:48:50 2015 -0400
b
@@ -0,0 +1,2 @@
+total,kept,f_kept
+6,2,0.3333
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/normalize-by-median.report.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/normalize-by-median.report.txt Fri Sep 18 17:48:50 2015 -0400
b
@@ -0,0 +1,2 @@
+total,kept,f_kept
+1001,1,0.000999
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/random-20-a.fa.part
--- a/test-data/random-20-a.fa.part Sat Sep 12 21:05:57 2015 -0400
+++ b/test-data/random-20-a.fa.part Fri Sep 18 17:48:50 2015 -0400
b
@@ -1,18 +1,18 @@
 >35 2
 CGCAGGCTGGATTCTAGAGGCAGAGGTGAGCTATAAGATATTGCATACGTTGAGCCAGC
->16 3
+>16 2
 CGGAAGCCCAATGAGTTGTCAGAGTCACCTCCACCCCGGGCCCTGTTAGCTACGTCCGT
->46 3
+>46 2
 GGTCGTGTTGGGTTAACAAAGGATCCCTGACTCGATCCAGCTGGGTAGGGTAACTATGT
 >40 2
 GGCTGAAGGAGCGGGCGTACGTGTTTACGGCATGATGGCCGGTGATTATGGGGGACGGG
->33 3
+>33 2
 GCAGCGGCTTTGAATGCCGAATATATAACAGCGACGGGGTTCAATAAGCTGCACATGCG
->98 0
+>98 2
 ACCAGATGCATAGCCCAACAGCTGAGACATTCCCAGCTCGCGAACCAAGACGTGAGAGC
 >17 2
 CCCTGTTAGCTACGTCCGTCTAAGGATATTAACATAGTTGCGACTGCGTCCTGTGCTCA
->89 3
+>89 2
 GCGAGATACTAGCAAAGGTTCATCAACAGCTACACCCGACGAACCCCGAGAAATTGGGA
 >30 2
 GTTATGGTCCAGGATGAATGCGCGTACCGGGCGCCTATCACTCCTCTTGTCATTCAGAA
@@ -22,17 +22,17 @@
 GTTTTTGTCATCGTGCATAAAGCGGGACAGAGTTCAACGGTATTCGAATGCACACCCTA
 >83 2
 CCTTCGGGGTGGAGCTGTTAATGAACTCAAGTGGCGATGGAGGCTAAAACGATACGTTG
->12 3
+>12 2
 AGCCAATTGTAACCATATGGTATCCAGTTTCCGTAGCAGCAATGCGCGACGGGCAATCG
 >85 2
 CGTGATATGATTACTAAAGGGGCCCGCAAAAACCCATTCACTGAGGGCTCTGTCCGTAC
 >2 2
 CCCGTGGGGCGGGCTAATTTTAAAGGCAGGTTGCTACACGTCAACTCTACCCAAGCTCC
->45 3
+>45 2
 ATACGCCACTCGACTTGGCTCGCCCTCGATCTAAAATAGCGGTCGTGTTGGGTTAACAA
->11 4
+>11 2
 GCAGCAGACCAACATCCAACACTTTTCACAAGAGGCTGACAGCCAATTGTAACCATATG
->39 4
+>39 2
 CAATTGACTTCCATGTGGGTCGGCTGTCAAGTCTAAACCGGGCTGAAGGAGCGGGCGTA
 >26 2
 AACATCTTAACCTCTGATCCCAACATGAGGGACATGAGTTTTCAAAGTAACGATGCGCA
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/random-20-a.fa.part.info
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/random-20-a.fa.part.info Fri Sep 18 17:48:50 2015 -0400
b
@@ -0,0 +1,1 @@
+1 subsets total
b
diff -r 3d90a3a78c3b -r 2c4635f5be47 test-data/random-20-a.part.extract.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/random-20-a.part.extract.fa Fri Sep 18 17:48:50 2015 -0400
b
@@ -0,0 +1,198 @@
+>35 2
+CGCAGGCTGGATTCTAGAGGCAGAGGTGAGCTATAAGATATTGCATACGTTGAGCCAGC
+>16 2
+CGGAAGCCCAATGAGTTGTCAGAGTCACCTCCACCCCGGGCCCTGTTAGCTACGTCCGT
+>46 2
+GGTCGTGTTGGGTTAACAAAGGATCCCTGACTCGATCCAGCTGGGTAGGGTAACTATGT
+>40 2
+GGCTGAAGGAGCGGGCGTACGTGTTTACGGCATGATGGCCGGTGATTATGGGGGACGGG
+>33 2
+GCAGCGGCTTTGAATGCCGAATATATAACAGCGACGGGGTTCAATAAGCTGCACATGCG
+>98 2
+ACCAGATGCATAGCCCAACAGCTGAGACATTCCCAGCTCGCGAACCAAGACGTGAGAGC
+>17 2
+CCCTGTTAGCTACGTCCGTCTAAGGATATTAACATAGTTGCGACTGCGTCCTGTGCTCA
+>89 2
+GCGAGATACTAGCAAAGGTTCATCAACAGCTACACCCGACGAACCCCGAGAAATTGGGA
+>30 2
+GTTATGGTCCAGGATGAATGCGCGTACCGGGCGCCTATCACTCCTCTTGTCATTCAGAA
+>82 2
+ATGCACTATATTTAAGAGGTCTAGAGTGTAAAAAGTGTACCCTTCGGGGTGGAGCTGTT
+>60 2
+GTTTTTGTCATCGTGCATAAAGCGGGACAGAGTTCAACGGTATTCGAATGCACACCCTA
+>83 2
+CCTTCGGGGTGGAGCTGTTAATGAACTCAAGTGGCGATGGAGGCTAAAACGATACGTTG
+>12 2
+AGCCAATTGTAACCATATGGTATCCAGTTTCCGTAGCAGCAATGCGCGACGGGCAATCG
+>85 2
+CGTGATATGATTACTAAAGGGGCCCGCAAAAACCCATTCACTGAGGGCTCTGTCCGTAC
+>2 2
+CCCGTGGGGCGGGCTAATTTTAAAGGCAGGTTGCTACACGTCAACTCTACCCAAGCTCC
+>45 2
+ATACGCCACTCGACTTGGCTCGCCCTCGATCTAAAATAGCGGTCGTGTTGGGTTAACAA
+>11 2
+GCAGCAGACCAACATCCAACACTTTTCACAAGAGGCTGACAGCCAATTGTAACCATATG
+>39 2
+CAATTGACTTCCATGTGGGTCGGCTGTCAAGTCTAAACCGGGCTGAAGGAGCGGGCGTA
+>26 2
+AACATCTTAACCTCTGATCCCAACATGAGGGACATGAGTTTTCAAAGTAACGATGCGCA
+>75 2
+GTCGGTGCCCGCGTGCGGAGCAGTCTTGATCCGGCGCGCTCTTACCTATGGTCGGCACG
+>81 2
+GGCTACTGGTTGATAAGCGTACGTAAAAGGCGAGTCTTACATGCACTATATTTAAGAGG
+>97 2
+ATTAGTGTGACTAGCCGAGTGCCCCAGCGTTTATCCAATGACCAGATGCATAGCCCAAC
+>13 2
+AATGCGCGACGGGCAATCGCGTCTGCGTTGATCGTCGCCCCTATTGTCGCTCCCTTAGT
+>92 2
+ATCAGGGCAAATTTGCTCGTGACTAAATGGTAATACTACCCGGGACAGTAAACTTTTGG
+>56 2
+AGATCTGCTTGGGTGTATCCCCATTCAGAGATACCAGATCTAAGCGACCATCAGAAACA
+>61 2
+TATTCGAATGCACACCCTAACATACTGGAAGATTCACTCTATATACCGGGAACTACTAA
+>96 2
+ATTAGACCGCTATCAACTCTTGCGAGGAAGGTCTGGGCCTATTAGTGTGACTAGCCGAG
+>31 2
+CTCCTCTTGTCATTCAGAAGGAATTTGATTAATTACCTGGGCTGACTCGCGCCCCCTGC
+>29 2
+TGGAAGCGCCCTCCGCTCAGGCGTTTTAGTAGATCCCAGTGTTATGGTCCAGGATGAAT
+>54 2
+TGGATGAGGTCCTTAAGGCCTAATTGACCAATCGCCCCAAGATTGGTGGTGAATGACTC
+>0 2
+TAGTGATCAGCGGCTAGTGTCGCCCCTCTTAGCACCTTGCGATCATCGAATCGGGCTGT
+>90 2
+GAACCCCGAGAAATTGGGAAGCCTGGAGGCAGTACAGTCATCCAGTCTGCTGCTCAAAG
+>34 2
+TCAATAAGCTGCACATGCGTGGTTGTGGCACGATCAGTTCCGCAGGCTGGATTCTAGAG
+>43 2
+AGGACTCGACGTCCGCCCCATGCTTGAGAGAAGGTTTCGGCCAACCATGGTAGGTTAGG
+>8 2
+ACACACAAGGCCAGACACCAACTTGGCCGTGGAATTTATCAACACTTCTGAGACGAAGG
+>37 2
+TGTGCGCTGTGAGATACAACTATAGGCACCGGGTTGCTGGCTAATAACCATTTAGAGTC
+>51 2
+ACACAATGGACGCGTTAAGGAGAACCGGTCGCAACCAGGTTGAAAATGCCTGATATACG
+>32 2
+GCTGACTCGCGCCCCCTGCAGGCTGCTATGATTGAGTGCGGCAGCGGCTTTGAATGCCG
+>78 2
+TCTGGGGCGAGATCCCCTCTGCTCACTTTCTTGTAGTAAATACACCGAAGGGGCGAACC
+>18 2
+CGACTGCGTCCTGTGCTCAGTTCGTGACGCCGAACTCAAGGACGCGGTACGAAGAACTG
+>36 2
+TTGCATACGTTGAGCCAGCGCCGCCCGTATACACAGGGTCTGTGCGCTGTGAGATACAA
+>53 2
+ATATAAGTTTTTTAGATGTAAAAAATTTTTTATGGCGGCCTGGATGAGGTCCTTAAGGC
+>24 2
+AAGAAACAGGCTAGGTCTTCCATGCAATGGTTCTCACAGTGTAGTCGCGCATCAACTCC
+>7 2
+AAACGTCTAAGTAATCATGCGACCGGCGCCTCGATTGGACACACACAAGGCCAGACACC
+>9 2
+AACACTTCTGAGACGAAGGTCATTTACGATTGGGACACTTTCTCGAACTCCGGTTAATT
+>47 2
+CTGGGTAGGGTAACTATGTAGCCATCGCTCAGTGGATTCTTCCGGGATAGGGTGTGCGA
+>62 2
+ATATACCGGGAACTACTAAAATTTTGGGCTACTCTATGCTTACAGCCCAACATGCGCAA
+>79 2
+TACACCGAAGGGGCGAACCCTGTCTACATTCGCAAATGCATCCTACCTGAGAGGCTTCG
+>48 2
+TCCGGGATAGGGTGTGCGAATGTGCCGGGCATTCAGCTCCTTAGAGACGAGTTACGAGC
+>66 2
+GGCGCGACCAATATTCATTTGATGAGAATTGAAATCGACTGAATCACGGGATTTATACA
+>25 2
+GTAGTCGCGCATCAACTCCGCCAGTTTTATCGAAGCGCCCAACATCTTAACCTCTGATC
+>5 2
+TCATTACGGGGTGTCCATCTAGAGAAAGTGGGTTTCCCTTATAGAAATGAGGAGGATTC
+>72 2
+ATAAAAAACGACTTCTAAAGCGACACTGGTTTTATCCTTCCCTGTTTTCCTCGCCCCAT
+>76 2
+CTTACCTATGGTCGGCACGATTCCATTGGCGGATATAGGATTGATTACGTGTGTTTACT
+>69 2
+GCAGCGAGGTATTTAAACTGTTCAATCGGCGCAACCGAAAATCTGCTACCGTGGTTGCT
+>87 2
+CAGTATACGCCCGTTGAGAAACAGGTGGTGGCGCAGTGTCGATTACTTCGTAATAATTT
+>27 2
+TTCAAAGTAACGATGCGCAGATTGAATAATGCCATATCTGCGCGAGAGGTTTCAGGTAC
+>77 2
+TTGATTACGTGTGTTTACTATACCGGTAGAAGCCTTCAGTTCTGGGGCGAGATCCCCTC
+>95 2
+TACGTGTGGCATCGTTGCACCCTAATTCGCATTATTAAGTATTAGACCGCTATCAACTC
+>63 2
+TACAGCCCAACATGCGCAACAACTATAAGCTGCTGCTGACAGATCCGTTTGTTCCGGAC
+>38 2
+CTAATAACCATTTAGAGTCGCCCGCGGTGATGAGTAATCGCAATTGACTTCCATGTGGG
+>20 2
+GTGCCTACCGTACCTGTCGAGCCAGTGCGATCAGTAAAACTACCGATTCGTGGCCTCCC
+>88 2
+GATTACTTCGTAATAATTTGAGGGTGCTGCCGCGTGTTCCGCGAGATACTAGCAAAGGT
+>49 2
+TTAGAGACGAGTTACGAGCCACTCTTGGATCGTCATGCATACCTCGCAGATCGGCAGAG
+>91 2
+TCCAGTCTGCTGCTCAAAGTCCATCTACATGTAAAGAACCATCAGGGCAAATTTGCTCG
+>86 2
+CTGAGGGCTCTGTCCGTACGTGTACTATAGATCCTTGCTCCAGTATACGCCCGTTGAGA
+>42 2
+CATATTTCAGGCGTGCGCCAACTTACGATTCTTGAATCCAAGGACTCGACGTCCGCCCC
+>70 2
+ATCTGCTACCGTGGTTGCTTCGACCATGGTAAACTGAGTAAGCCCTTATGAGTTGCGGG
+>19 2
+GACGCGGTACGAAGAACTGCTCCAGCAACAGCATTCCTTGGTGCCTACCGTACCTGTCG
+>84 2
+AGGCTAAAACGATACGTTGTATACTAAGAACTGTCTACATCGTGATATGATTACTAAAG
+>52 2
+TGAAAATGCCTGATATACGAAGATTAAGCGGCTTTGGATCATATAAGTTTTTTAGATGT
+>71 2
+AGCCCTTATGAGTTGCGGGTCGTGCTGTTAGACTGAACACATAAAAAACGACTTCTAAA
+>93 2
+CGGGACAGTAAACTTTTGGTGATGCCAGCACGACCAGCGCAGGGTCAAGAAAACTATTA
+>58 2
+TCGTGGTACACCCGGAGTCTCGAAAGGAGCTTGCAAAGCTTTTCAGCATGGGTCGCATT
+>22 2
+TTCATTCCCCTGTAACGTTTCGAACTCAACTTGCTTGCCCGACATATGGCGGTACGCGG
+>50 2
+ACCTCGCAGATCGGCAGAGAACGGTTTGGTCTGTTTGCGTACACAATGGACGCGTTAAG
+>21 2
+TACCGATTCGTGGCCTCCCGTTCGTCGCAATGAACGGCTTTTCATTCCCCTGTAACGTT
+>73 2
+CCTGTTTTCCTCGCCCCATGCAATGGTAACTAATATACCGCCCCATAGTCTTAATAACC
+>68 2
+CTGTCCCAACGGTAACAATGGAGGCACTATACCGACGCTCGCAGCGAGGTATTTAAACT
+>23 2
+GACATATGGCGGTACGCGGGCTCAGCGCTCCGCCAGTAAGAAGAAACAGGCTAGGTCTT
+>94 2
+AGGGTCAAGAAAACTATTAATTTAAGCGCTGTTTAGTAACTACGTGTGGCATCGTTGCA
+>10 2
+TCTCGAACTCCGGTTAATTTGCAATCCGGGGGTTTGCTCAGCAGCAGACCAACATCCAA
+>41 2
+GGTGATTATGGGGGACGGGTATAGTACTAATAGTTTTGGGCATATTTCAGGCGTGCGCC
+>80 2
+TCCTACCTGAGAGGCTTCGACTAAAGAATGCGGGTATACTGGCTACTGGTTGATAAGCG
+>64 2
+AGATCCGTTTGTTCCGGACGGTCGTCGTACCCACCCCTTGTCGATAGGTAAAGGAGTAA
+>57 2
+TAAGCGACCATCAGAAACACAGCATCAGCTTACCAGCCTTTCGTGGTACACCCGGAGTC
+>1 2
+GATCATCGAATCGGGCTGTCGCCAAAGGCCGACCAAGGTTCCCGTGGGGCGGGCTAATT
+>55 2
+GATTGGTGGTGAATGACTCACAAAATGCTCATAGAATATTAGATCTGCTTGGGTGTATC
+>67 2
+GAATCACGGGATTTATACATCATTTATAGCTAAATTACACCTGTCCCAACGGTAACAAT
+>14 2
+CTATTGTCGCTCCCTTAGTTGTTGGGCGTAGTCCGCACCTAGAGTCCAACCAGGCCTCG
+>15 2
+AGAGTCCAACCAGGCCTCGACAATCCTTTGTCCTGTCCCCCGGAAGCCCAATGAGTTGT
+>59 2
+TTTCAGCATGGGTCGCATTCCTACCTAAGGCTAGGGGCATGTTTTTGTCATCGTGCATA
+>28 2
+CGCGAGAGGTTTCAGGTACCTATCGGGACAGACTTGTTTCTGGAAGCGCCCTCCGCTCA
+>74 2
+CCCCATAGTCTTAATAACCGACACCGAGACGCTACATGGCGTCGGTGCCCGCGTGCGGA
+>4 2
+TGTAACCTGTGTGGGGTCGGTCCTGGGGAAACTTTGGGTTTCATTACGGGGTGTCCATC
+>65 2
+TCGATAGGTAAAGGAGTAAGCGTCCGACTCCCTCTTACTTGGCGCGACCAATATTCATT
+>6 2
+ATAGAAATGAGGAGGATTCACAGACACGTCAGTCACCATCAAACGTCTAAGTAATCATG
+>44 2
+CCAACCATGGTAGGTTAGGAAAGCCGCCAAATAAGTTCTTATACGCCACTCGACTTGGC
+>3 2
+TCAACTCTACCCAAGCTCCTTGCATCTCGGTACCCCCCCTTGTAACCTGTGTGGGGTCG