Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.xml @ 3:02c13ef1a669 draft
Uploaded v0.2.1, fixed missing test file, more tests.
author | peterjc |
---|---|
date | Fri, 27 Mar 2015 09:34:27 -0400 |
parents | da64f6a9e32b |
children | d3aa9f25c24c |
rev | line source |
---|---|
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
1 <tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.1"> |
0 | 2 <description>e.g. to reduce coverage</description> |
3 <requirements> | |
2 | 4 <requirement type="package" version="1.65">biopython</requirement> |
0 | 5 <requirement type="python-module">Bio</requirement> |
6 </requirements> | |
7 <version_command interpreter="python">sample_seqs.py --version</version_command> | |
8 <command interpreter="python"> | |
2 | 9 sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file" |
0 | 10 #if str($sampling.type) == "everyNth": |
2 | 11 -n "${sampling.every_n}" |
0 | 12 #elif str($sampling.type) == "percentage": |
2 | 13 -p "${sampling.percent}" |
14 #else | |
15 -c "${sampling.count}" | |
16 #end if | |
17 #if $interleaved | |
18 --interleaved | |
0 | 19 #end if |
20 </command> | |
21 <stdio> | |
22 <!-- Anything other than zero is an error --> | |
23 <exit_code range="1:" /> | |
24 <exit_code range=":-1" /> | |
25 </stdio> | |
26 <inputs> | |
27 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." /> | |
28 <conditional name="sampling"> | |
29 <param name="type" type="select" label="Sub-sampling approach"> | |
2 | 30 <option value="everyNth">Take every N-th sequence (or pair, e.g. every fifth sequence)</option> |
31 <option value="percentage">Take some percentage of the sequences (or pairs, e.g. 20% will take every fifth sequence)</option> | |
32 <option value="desired_count">Take exactly N sequences (or pairs, e.g. 1000 sequences)</option> | |
0 | 33 <!-- TODO - target coverage etc --> |
34 </param> | |
35 <when value="everyNth"> | |
36 <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" /> | |
37 </when> | |
38 <when value="percentage"> | |
39 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" /> | |
40 </when> | |
2 | 41 <when value="desired_count"> |
42 <param name="count" value="1000" type="integer" min="1" label="N" help="Number of unique sequences to pick (between 1 and the total number of sequences in the input file)" /> | |
43 </when> | |
0 | 44 </conditional> |
2 | 45 <param name="interleaved" type="boolean" label="Interleaved paired reads" help="This mode keeps paired reads together (e.g. take every 5th read pair)" /> |
0 | 46 </inputs> |
47 <outputs> | |
48 <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/> | |
49 </outputs> | |
50 <tests> | |
51 <test> | |
52 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
53 <param name="type" value="everyNth" /> | |
54 <param name="every_n" value="100" /> | |
55 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> | |
56 </test> | |
57 <test> | |
58 <param name="input_file" value="ecoli.fastq" /> | |
59 <param name="type" value="everyNth" /> | |
60 <param name="every_n" value="100" /> | |
61 <output name="output_file" file="ecoli.sample_N100.fastq" /> | |
62 </test> | |
63 <test> | |
2 | 64 <param name="input_file" value="ecoli.fastq" /> |
65 <param name="type" value="everyNth" /> | |
66 <param name="every_n" value="100" /> | |
67 <param name="interleaved" value="true" /> | |
68 <output name="output_file" file="ecoli.pair_sample_N100.fastq" /> | |
69 </test> | |
70 <test> | |
0 | 71 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
72 <param name="type" value="everyNth" /> | |
73 <param name="every_n" value="5" /> | |
74 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> | |
75 </test> | |
76 <test> | |
77 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
78 <param name="type" value="percentage" /> | |
79 <param name="percent" value="1.0" /> | |
80 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> | |
81 </test> | |
82 <test> | |
2 | 83 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> |
84 <param name="type" value="everyNth" /> | |
85 <param name="every_n" value="100" /> | |
86 <param name="interleaved" value="true" /> | |
87 <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" /> | |
88 </test> | |
89 <test> | |
90 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
91 <param name="type" value="desired_count" /> | |
92 <param name="count" value="2910" /> | |
93 <output name="output_file" file="get_orf_input.Suis_ORF.prot.fasta" /> | |
94 </test> | |
95 <test> | |
96 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
97 <param name="type" value="desired_count" /> | |
98 <param name="count" value="10" /> | |
99 <param name="interleaved" value="true" /> | |
100 <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta" /> | |
101 </test> | |
102 <test> | |
0 | 103 <param name="input_file" value="ecoli.fastq" /> |
104 <param name="type" value="percentage" /> | |
105 <param name="percent" value="1.0" /> | |
106 <output name="output_file" file="ecoli.sample_N100.fastq" /> | |
107 </test> | |
108 <test> | |
2 | 109 <param name="input_file" value="ecoli.fastq" /> |
110 <param name="type" value="desired_count" /> | |
111 <param name="count" value="10" /> | |
112 <output name="output_file" file="ecoli.sample_C10.fastq" /> | |
113 </test> | |
114 <test> | |
115 <param name="input_file" value="ecoli.sample_C10.fastq" /> | |
116 <param name="type" value="desired_count" /> | |
117 <param name="count" value="10" /> | |
118 <output name="output_file" file="ecoli.sample_C10.fastq" /> | |
119 </test> | |
120 <test> | |
0 | 121 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
122 <param name="type" value="percentage" /> | |
123 <param name="percent" value="20.0" /> | |
124 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
125 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
126 <has_line line="Sampling 20.000% of sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
127 <has_line line="Selected 5 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
128 </assert_stderr> |
0 | 129 </test> |
2 | 130 <test> |
131 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> | |
132 <param name="type" value="everyNth" /> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
133 <param name="every_n" value="5" /> |
2 | 134 <param name="interleaved" value="true" /> |
135 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
136 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
137 <has_line line="Sampling every 5th sequence" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
138 <has_line line="Selected 3 pairs" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
139 </assert_stderr> |
2 | 140 </test> |
141 <test> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
142 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
2 | 143 <param name="type" value="desired_count" /> |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
144 <param name="count" value="25" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
145 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff"/> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
146 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
147 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
148 <has_line line="Taking all the sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
149 <has_line line="Selected 25 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
150 </assert_stderr> |
2 | 151 </test> |
152 <test> | |
153 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> | |
154 <param name="type" value="desired_count" /> | |
155 <param name="count" value="1" /> | |
156 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
157 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
158 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
159 <has_line line="Sampling just first sequence!" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
160 <has_line line="Selected 1 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
161 </assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
162 </test> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
163 <test expect_failure="true" expect_exit_code="1"> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
164 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
165 <param name="type" value="desired_count" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
166 <param name="count" value="30" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
167 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
168 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
169 <has_line line="Requested 30 sequences, but file only has 25." /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
170 </assert_stderr> |
2 | 171 </test> |
0 | 172 </tests> |
173 <help> | |
174 **What it does** | |
175 | |
176 Takes an input file of sequences (typically FASTA or FASTQ, but also | |
177 Standard Flowgram Format (SFF) is supported), and returns a new sequence | |
2 | 178 file sub-sampling uniformly from this (in the same format, preserving the |
179 input order and selecting sequences evenly through the input file). | |
0 | 180 |
2 | 181 Several sampling modes are supported, all designed to do non-random |
182 uniform sampling (i.e. evenly through the input file). This allows | |
183 reproducibility, and also works on paired sequence files (run the tool | |
184 twice, once on each file using the same settings). | |
0 | 185 |
2 | 186 By sampling uniformly (evenly) through the file, this avoids any bias |
187 should reads in any part of the file be of lesser quality (e.g. for | |
188 high throughput sequencing the reads at the start and end of the file | |
189 can be of lower quality). | |
190 | |
191 The simplest mode is to take every *N*-th sequence, for example taking | |
0 | 192 every 2nd sequence would sample half the file - while taking every 5th |
193 sequence would take 20% of the file. | |
194 | |
2 | 195 The target count method picks *N* sequences from the input file, which |
196 again will be distributed uniformly (evenly) through the file. This works | |
197 by first counting the number of records, then calculating the desired | |
198 percentage of sequences to take. Note if your input file has exactly | |
199 *N* sequences this selects them all (effectively copying the input file). | |
200 If your input file has fewer than *N* sequences, this is treated as an | |
201 error. | |
202 | |
203 If you tick the interleaved option, the file is processed as pairs of | |
204 records to ensure your read pairs are not separated by sampling. | |
205 For example using 20% would take every 5th pair of records, or you | |
206 could request 1000 read pairs. | |
207 | |
208 .. class:: warningmark | |
209 | |
210 Note interleaved/pair mode does *not* actually check your read names | |
211 match a known pair naming scheme! | |
0 | 212 |
213 **Example Usage** | |
214 | |
215 Suppose you have some Illumina paired end data as files ``R1.fastq`` and | |
216 ``R2.fastq`` which give an estimated x200 coverage, and you wish to do a | |
217 *de novo* assembly with a tool like MIRA which recommends lower coverage. | |
218 Taking every 3rd read would reduce the estimated coverage to about x66, | |
219 and would preserve the pairing as well. | |
220 | |
2 | 221 Similarly, if you had some Illumina paired end data interleaved into one |
222 file with an estimated x200 coverage, you would run this tool in | |
223 interleaved mode, taking every 3rd read pair. This would again reduce | |
224 the estimated coverage to about x66, while preserving the read pairing. | |
225 | |
226 Suppose you have a transcriptome assembly, and wish to look at the | |
227 species distribution of the top BLAST hits for an initial quality check. | |
228 Rather than using all your sequences, you could pick 1000 only for this. | |
0 | 229 |
230 **Citation** | |
231 | |
232 This tool uses Biopython, so if you use this Galaxy tool in work leading to a | |
233 scientific publication please cite the following paper: | |
234 | |
235 Cock et al (2009). Biopython: freely available Python tools for computational | |
236 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
237 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
238 | |
239 This tool is available to install into other Galaxy Instances via the Galaxy | |
240 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs | |
241 </help> | |
2 | 242 <citations> |
243 <citation type="doi">10.1093/bioinformatics/btp163</citation> | |
244 </citations> | |
0 | 245 </tool> |