Mercurial > repos > peterjc > sample_seqs
annotate tools/sample_seqs/sample_seqs.xml @ 6:31f5701cd2e9 draft
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
author | peterjc |
---|---|
date | Thu, 11 May 2017 07:24:38 -0400 |
parents | 6b71ad5d43fb |
children | 86710edcec02 |
rev | line source |
---|---|
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
1 <tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.4"> |
0 | 2 <description>e.g. to reduce coverage</description> |
3 <requirements> | |
6
31f5701cd2e9
v0.2.4 Depends on Biopython 1.67 via legacy Tool Shed package or bioconda.
peterjc
parents:
5
diff
changeset
|
4 <requirement type="package" version="1.67">biopython</requirement> |
0 | 5 </requirements> |
4 | 6 <stdio> |
7 <!-- Anything other than zero is an error --> | |
8 <exit_code range="1:" /> | |
9 <exit_code range=":-1" /> | |
10 </stdio> | |
0 | 11 <version_command interpreter="python">sample_seqs.py --version</version_command> |
12 <command interpreter="python"> | |
2 | 13 sample_seqs.py -f "$input_file.ext" -i "$input_file" -o "$output_file" |
0 | 14 #if str($sampling.type) == "everyNth": |
2 | 15 -n "${sampling.every_n}" |
0 | 16 #elif str($sampling.type) == "percentage": |
2 | 17 -p "${sampling.percent}" |
18 #else | |
19 -c "${sampling.count}" | |
20 #end if | |
21 #if $interleaved | |
22 --interleaved | |
0 | 23 #end if |
24 </command> | |
25 <inputs> | |
26 <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." /> | |
27 <conditional name="sampling"> | |
28 <param name="type" type="select" label="Sub-sampling approach"> | |
2 | 29 <option value="everyNth">Take every N-th sequence (or pair, e.g. every fifth sequence)</option> |
30 <option value="percentage">Take some percentage of the sequences (or pairs, e.g. 20% will take every fifth sequence)</option> | |
31 <option value="desired_count">Take exactly N sequences (or pairs, e.g. 1000 sequences)</option> | |
0 | 32 <!-- TODO - target coverage etc --> |
33 </param> | |
34 <when value="everyNth"> | |
35 <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" /> | |
36 </when> | |
37 <when value="percentage"> | |
38 <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" /> | |
39 </when> | |
2 | 40 <when value="desired_count"> |
41 <param name="count" value="1000" type="integer" min="1" label="N" help="Number of unique sequences to pick (between 1 and the total number in the input file)" /> | |
42 </when> | |
0 | 43 </conditional> |
2 | 44 <param name="interleaved" type="boolean" label="Interleaved paired reads" help="This mode keeps paired reads together (e.g. take every 5th read pair)" /> |
0 | 45 </inputs> |
46 <outputs> | |
4 | 47 <data name="output_file" format_source="input_file" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/> |
0 | 48 </outputs> |
49 <tests> | |
50 <test> | |
51 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
52 <param name="type" value="everyNth" /> | |
53 <param name="every_n" value="100" /> | |
54 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> | |
55 </test> | |
56 <test> | |
57 <param name="input_file" value="ecoli.fastq" /> | |
58 <param name="type" value="everyNth" /> | |
59 <param name="every_n" value="100" /> | |
60 <output name="output_file" file="ecoli.sample_N100.fastq" /> | |
61 </test> | |
62 <test> | |
2 | 63 <param name="input_file" value="ecoli.fastq" /> |
64 <param name="type" value="everyNth" /> | |
65 <param name="every_n" value="100" /> | |
66 <param name="interleaved" value="true" /> | |
67 <output name="output_file" file="ecoli.pair_sample_N100.fastq" /> | |
68 </test> | |
69 <test> | |
0 | 70 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
71 <param name="type" value="everyNth" /> | |
72 <param name="every_n" value="5" /> | |
73 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> | |
74 </test> | |
75 <test> | |
76 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
77 <param name="type" value="percentage" /> | |
78 <param name="percent" value="1.0" /> | |
79 <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> | |
80 </test> | |
81 <test> | |
2 | 82 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> |
83 <param name="type" value="everyNth" /> | |
84 <param name="every_n" value="100" /> | |
85 <param name="interleaved" value="true" /> | |
86 <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_N100.fasta" /> | |
87 </test> | |
88 <test> | |
89 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
90 <param name="type" value="desired_count" /> | |
91 <param name="count" value="2910" /> | |
92 <output name="output_file" file="get_orf_input.Suis_ORF.prot.fasta" /> | |
93 </test> | |
94 <test> | |
95 <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> | |
96 <param name="type" value="desired_count" /> | |
97 <param name="count" value="10" /> | |
98 <param name="interleaved" value="true" /> | |
99 <output name="output_file" file="get_orf_input.Suis_ORF.prot.pair_sample_C10.fasta" /> | |
100 </test> | |
101 <test> | |
0 | 102 <param name="input_file" value="ecoli.fastq" /> |
103 <param name="type" value="percentage" /> | |
104 <param name="percent" value="1.0" /> | |
105 <output name="output_file" file="ecoli.sample_N100.fastq" /> | |
106 </test> | |
107 <test> | |
2 | 108 <param name="input_file" value="ecoli.fastq" /> |
109 <param name="type" value="desired_count" /> | |
110 <param name="count" value="10" /> | |
111 <output name="output_file" file="ecoli.sample_C10.fastq" /> | |
112 </test> | |
113 <test> | |
114 <param name="input_file" value="ecoli.sample_C10.fastq" /> | |
115 <param name="type" value="desired_count" /> | |
116 <param name="count" value="10" /> | |
117 <output name="output_file" file="ecoli.sample_C10.fastq" /> | |
118 </test> | |
119 <test> | |
0 | 120 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
121 <param name="type" value="percentage" /> | |
122 <param name="percent" value="20.0" /> | |
123 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
124 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
125 <has_line line="Sampling 20.000% of sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
126 <has_line line="Selected 5 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
127 </assert_stderr> |
0 | 128 </test> |
2 | 129 <test> |
130 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> | |
131 <param name="type" value="everyNth" /> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
132 <param name="every_n" value="5" /> |
2 | 133 <param name="interleaved" value="true" /> |
134 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
135 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
136 <has_line line="Sampling every 5th sequence" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
137 <has_line line="Selected 3 pairs" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
138 </assert_stderr> |
2 | 139 </test> |
140 <test> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
141 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
2 | 142 <param name="type" value="desired_count" /> |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
143 <param name="count" value="25" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
144 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff"/> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
145 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
146 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
147 <has_line line="Taking all the sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
148 <has_line line="Selected 25 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
149 </assert_stderr> |
2 | 150 </test> |
151 <test> | |
152 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> | |
153 <param name="type" value="desired_count" /> | |
154 <param name="count" value="1" /> | |
155 <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff" ftype="sff"/> | |
3
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
156 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
157 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
158 <has_line line="Sampling just first sequence!" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
159 <has_line line="Selected 1 records" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
160 </assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
161 </test> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
162 <test expect_failure="true" expect_exit_code="1"> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
163 <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
164 <param name="type" value="desired_count" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
165 <param name="count" value="30" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
166 <assert_stderr> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
167 <has_line line="Input file has 25 sequences" /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
168 <has_line line="Requested 30 sequences, but file only has 25." /> |
02c13ef1a669
Uploaded v0.2.1, fixed missing test file, more tests.
peterjc
parents:
2
diff
changeset
|
169 </assert_stderr> |
2 | 170 </test> |
0 | 171 </tests> |
172 <help> | |
173 **What it does** | |
174 | |
175 Takes an input file of sequences (typically FASTA or FASTQ, but also | |
176 Standard Flowgram Format (SFF) is supported), and returns a new sequence | |
2 | 177 file sub-sampling uniformly from this (in the same format, preserving the |
178 input order and selecting sequences evenly through the input file). | |
0 | 179 |
2 | 180 Several sampling modes are supported, all designed to do non-random |
181 uniform sampling (i.e. evenly through the input file). This allows | |
182 reproducibility, and also works on paired sequence files (run the tool | |
183 twice, once on each file using the same settings). | |
0 | 184 |
2 | 185 By sampling uniformly (evenly) through the file, this avoids any bias |
186 should reads in any part of the file be of lesser quality (e.g. for | |
187 high throughput sequencing the reads at the start and end of the file | |
188 can be of lower quality). | |
189 | |
190 The simplest mode is to take every *N*-th sequence, for example taking | |
0 | 191 every 2nd sequence would sample half the file - while taking every 5th |
192 sequence would take 20% of the file. | |
193 | |
2 | 194 The target count method picks *N* sequences from the input file, which |
195 again will be distributed uniformly (evenly) through the file. This works | |
196 by first counting the number of records, then calculating the desired | |
197 percentage of sequences to take. Note if your input file has exactly | |
198 *N* sequences this selects them all (effectively copying the input file). | |
199 If your input file has fewer than *N* sequences, this is treated as an | |
200 error. | |
201 | |
202 If you tick the interleaved option, the file is processed as pairs of | |
203 records to ensure your read pairs are not separated by sampling. | |
204 For example using 20% would take every 5th pair of records, or you | |
205 could request 1000 read pairs. | |
206 | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
207 If instead of interleaved paired reads you have two matched files (one |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
208 for each pair), run the tool twice with the same sampling options to |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
209 make two matched smaller files. |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
210 |
2 | 211 .. class:: warningmark |
212 | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
213 Note interleaved/pair mode does *not* actually check your read names |
2 | 214 match a known pair naming scheme! |
0 | 215 |
216 **Example Usage** | |
217 | |
218 Suppose you have some Illumina paired end data as files ``R1.fastq`` and | |
219 ``R2.fastq`` which give an estimated x200 coverage, and you wish to do a | |
220 *de novo* assembly with a tool like MIRA which recommends lower coverage. | |
5
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
221 Running the tool twice (on ``R1.fastq`` and ``R2.fastq``) taking every |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
222 3rd read would reduce the estimated coverage to about x66, and would |
6b71ad5d43fb
v0.2.3 clarified help, internal cleanup of Python script
peterjc
parents:
4
diff
changeset
|
223 preserve the pairing as well (as two smaller FASTQ files). |
0 | 224 |
2 | 225 Similarly, if you had some Illumina paired end data interleaved into one |
226 file with an estimated x200 coverage, you would run this tool in | |
227 interleaved mode, taking every 3rd read pair. This would again reduce | |
228 the estimated coverage to about x66, while preserving the read pairing. | |
229 | |
230 Suppose you have a transcriptome assembly, and wish to look at the | |
231 species distribution of the top BLAST hits for an initial quality check. | |
232 Rather than using all your sequences, you could pick 1000 only for this. | |
0 | 233 |
234 **Citation** | |
235 | |
236 This tool uses Biopython, so if you use this Galaxy tool in work leading to a | |
237 scientific publication please cite the following paper: | |
238 | |
239 Cock et al (2009). Biopython: freely available Python tools for computational | |
240 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. | |
241 http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. | |
242 | |
243 This tool is available to install into other Galaxy Instances via the Galaxy | |
244 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs | |
245 </help> | |
2 | 246 <citations> |
247 <citation type="doi">10.1093/bioinformatics/btp163</citation> | |
248 </citations> | |
0 | 249 </tool> |