comparison fastq_dump.xml @ 7:c7620aa7e1f0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit d1347141d384ed404f674d7ce408b6769e763ea1
author iuc
date Wed, 10 May 2017 10:45:41 -0400
parents 30775c836c77
children 1920e0508831
comparison
equal deleted inserted replaced
6:30775c836c77 7:c7620aa7e1f0
1 <tool id="fastq_dump" name="Extract reads" version="@VERSION@.1"> 1 <tool id="fastq_dump" name="Extract reads in Fastq/a" version="@VERSION@.2">
2 <description>in FASTQ/A format from NCBI SRA.</description> 2 <description>format from NCBI SRA</description>
3 <macros> 3 <macros>
4 <import>sra_macros.xml</import> 4 <import>sra_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements"/> 6 <expand macro="requirements"/>
7 <version_command>fastq-dump --version</version_command> 7 <version_command>fastq-dump --version</version_command>
8 <command detect_errors="exit_code"> 8 <command detect_errors="exit_code">
9 <![CDATA[ 9 <![CDATA[
10 10
11 #if $input.input_select=="file_list": 11 #if $input.input_select=="file_list":
12 for acc in `cat $input.file_list` ; 12
13 do 13 for acc in `cat $input.file_list` ;
14 do
15
14 #elif $input.input_select=="accession_number": 16 #elif $input.input_select=="accession_number":
15 acc="$input.accession" && 17
18 ## Stripping leading and trailing spaces in case user typed them in
19 acc="${input.accession}" &&
20
16 #end if 21 #end if
17 22
18 #if $input.input_select=="file_list" or $input.input_select=="accession_number": 23 #if $input.input_select=="file_list" or $input.input_select=="accession_number":
19 [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && ( 24
25 [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && (
26
20 #end if 27 #end if
21 28
22 ## Need to set the home directory to the current working directory, 29 ## Need to set the home directory to the current working directory,
23 ## else the tool tries to write to home/.ncbi and fails when used 30 ## else the tool tries to write to home/.ncbi and fails when used
24 ## with a cluster manager. 31 ## with a cluster manager.
72 --matepair-distance "$adv.matepairDist" 79 --matepair-distance "$adv.matepairDist"
73 #end if 80 #end if
74 $adv.clip 81 $adv.clip
75 $adv.skip_technical 82 $adv.skip_technical
76 83
77 #if str( $outputformat ) == "fasta": 84 #if str( $outputformat ) == "fastqsanger.gz":
78 --fasta 85 --gzip
86 #elif str( $outputformat ) == "fastqsanger.bz2":
87 --bzip2
79 #end if 88 #end if
80 #if $input.input_select=="file": 89 #if $input.input_select=="file":
81 --stdout 90 --stdout
82 "$input.file" > "$output_file" 91 "$input.file" > "$output_file"
83 #elif $input.input_select=="file_list": 92
84 "\$acc" 93 #elif $input.input_select=="accession_number":
85 #else: 94 --stdout
86 --stdout
87 "\$acc" > "$output_accession" ) 95 "\$acc" > "$output_accession" )
88 #end if 96 #end if
89 97
90 #if $input.input_select=="file_list": 98 #if $input.input_select=="file_list":
91 ) ; done 99 ) ; done
92 100
93 ; 101 ;
94 102
95 103 for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
96 104 count=`ls \$i* | wc -l` ;
97 105 data=(\$(ls -d \$i*));
98
99 for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
100 count=`ls \$i* | wc -l` ;
101 data=(\$(ls -d \$i*));
102 106
103 if [ "\$count" -eq 2 ]; then 107 if [ "\$count" -eq 2 ]; then
104 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ; 108 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ;
105 elif [ "\$count" -eq 1 ]; then 109 elif [ "\$count" -eq 1 ]; then
106 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ; 110 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ;
107 fi; 111 fi;
108 done 112 done
109 113
110 114
111 #end if 115 #end if
112 116
113 117
114 ]]> 118 ]]>
115 </command> 119 </command>
116 <inputs> 120 <inputs>
117 <expand macro="input_conditional"/> 121 <expand macro="input_conditional"/>
118 <param name="outputformat" type="select" label="select output format"> 122 <param name="outputformat" type="select" display="radio" label="Select output format" help="Compression will greatly reduce the amount of space occupied by downloaded data. Downstream applications such as short-read mappers will accept compressed data as input. Consider this example: an uncompressed 400 Mb fastq dataset compresses to 100 Mb or 80 Mb with gzip or bzip2, respectively. " argument="--gzip --bzip2">
119 <option value="fastqsanger">fastq</option> 123 <option value="fastqsanger.gz">gzip compressed fastq</option>
120 <option value="fasta">fasta</option> 124 <option value="fastqsanger">Uncompressed fastq</option>
125 <option value="fastqsanger.bz2">bzip2 compressed fastq</option>
121 </param> 126 </param>
122 <section name="adv" title="Advanced Options" expanded="False"> 127 <section name="adv" title="Advanced Options" expanded="False">
123 <param name="minID" type="integer" label="minimum spot ID" optional="true"/> 128 <param name="minID" type="integer" label="Minimum spot ID" optional="true" help="Minimum spot id to be dumped." argument="--minSpotId"/>
124 <param name="maxID" type="integer" label="maximum spot ID" optional="true"/> 129 <param name="maxID" type="integer" label="Maximum spot ID" optional="true" help="Maximum spot id to be dumped." argument="--maxSpotId"/>
125 <param name="minlen" type="integer" label="minimum read length" optional="true"/> 130 <param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. Will dump only reads longer or equal to this value." argument="--minReadLen"/>
126 <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue=""> 131 <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="Split spot by read pairs" help="Split spots into individual reads." argument="--split-spot"/>
127 <label>split spot by read pairs</label>
128 </param>
129 <expand macro="alignments"/> 132 <expand macro="alignments"/>
130 <expand macro="region"/> 133 <expand macro="region"/>
131 <expand macro="matepairDist"/> 134 <expand macro="matepairDist"/>
132 <param name="readfilter" type="select" value=""> 135 <param name="readfilter" type="select" value="" label="filter by value" argument="--read-filter">
133 <label>filter by value</label>
134 <option value="">None</option> 136 <option value="">None</option>
135 <option value="pass">pass</option> 137 <option value="pass">pass</option>
136 <option value="reject">reject</option> 138 <option value="reject">reject</option>
137 <option value="criteria">criteria</option> 139 <option value="criteria">criteria</option>
138 <option value="redacted">redacted</option> 140 <option value="redacted">redacted</option>
139 </param> 141 </param>
140 <param name="spotgroups" type="text" label="filter by spot-groups" optional="true"/> 142 <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/>
141 <param name="clip" type="boolean" truevalue="--clip" falsevalue=""> 143 <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" />
142 <label>apply left and right clips</label> 144 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/>
143 </param>
144 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads"/>
145 </section> 145 </section>
146 </inputs> 146 </inputs>
147 <outputs> 147 <outputs>
148 <collection name="list_paired" type="list:paired" label="Pair-end Fast(q|a)"> 148 <collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)">
149 <filter>input['input_select'] == "file_list"</filter> 149 <filter>input['input_select'] == "file_list"</filter>
150
150 <!-- Use named regex group to grab pattern 151 <!-- Use named regex group to grab pattern
151 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list 152 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list
152 identifier in the nested collection and identifier_1 is either 153 identifier in the nested collection and identifier_1 is either
153 forward or reverse (for instance samp1_forward.fq). 154 forward or reverse (for instance samp1_forward.fq).
154 --> 155 -->
155 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastq" ext="fastqsanger" visible="false" /> 156
156 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fasta_(?P&lt;identifier_1&gt;[^_]+)\.fasta" ext="fasta" visible="false" /> 157 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger" ext="fastqsanger" />
157 </collection> 158 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.gz_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" />
158 <collection name="output_collection" type='list' label="Single-end Fast(q|a)"> 159 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.bz2_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" />
159 <filter>input['input_select'] == "file_list"</filter> 160 </collection>
160 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastq" directory="." ext='fastqsanger'/> 161 <collection name="output_collection" type='list' label="Single-end data (fastq-dump)">
161 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fasta__single\.fasta" directory="." ext='fasta'/> 162 <filter>input['input_select'] == "file_list"</filter>
162 </collection> 163 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastqsanger" directory="." ext='fastqsanger'/>
163 <data format="fastqsanger" name="output_accession" > 164 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="." ext='fastqsanger.gz'/>
164 <filter>input['input_select'] == "accession_number"</filter> 165 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="." ext='fastqsanger.bz2'/>
165 <change_format> 166 </collection>
166 <when input="outputformat" value="fasta" format="fasta"/> 167 <data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)">
167 </change_format> 168 <filter>input['input_select'] == "accession_number"</filter>
168 </data> 169 <change_format>
169 <data format="fastqsanger" name="output_file" label="${input.file.name}.${outputformat}"> 170 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
170 <filter>input['input_select'] == "file"</filter> 171 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
171 <change_format> 172 </change_format>
172 <when input="outputformat" value="fasta" format="fasta"/> 173 </data>
173 </change_format> 174 <data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)">
174 </data> 175 <filter>input['input_select'] == "file"</filter>
176 <change_format>
177 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
178 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
179 </change_format>
180 </data>
175 </outputs> 181 </outputs>
176 <tests> 182 <tests>
177 <test> 183 <test>
178 <param name="input_select" value="accession_number"/> 184 <param name="input_select" value="accession_number"/>
179 <param name="outputformat" value="fastqsanger"/> 185 <param name="outputformat" value="fastqsanger"/>
180 <param name="accession" value="SRR044777"/> 186 <param name="accession" value="SRR044777"/>
181 <param name="skip_technical" value="True"/> 187 <param name="skip_technical" value="True"/>
182 <output name="output_accession"> 188 <output name="output_accession">
183 <assert_contents> 189 <assert_contents>
184 <not_has_text text="rRNA_primer"/> 190 <not_has_text text="rRNA_primer"/>
185 <has_text text="F47USSH02GNP1D" /> 191 <has_text text="F47USSH02GNP1D" />
186 </assert_contents> 192 </assert_contents>
187 </output> 193 </output>
188 </test> 194 </test>
189 <test> 195 <test>
190 <param name="input_select" value="accession_number"/> 196 <param name="input_select" value="accession_number"/>
191 <param name="outputformat" value="fastqsanger"/> 197 <param name="outputformat" value="fastqsanger.gz"/>
192 <param name="accession" value="SRR925743"/> 198 <param name="accession" value="SRR925743"/>
193 <param name="maxID" value="5"/> 199 <param name="maxID" value="5"/>
194 <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> 200 <output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/>
195 </test> 201 </test>
196 <test> 202 <test>
197 <param name="input_select" value="file_list"/> 203 <param name="input_select" value="accession_number"/>
198 <param name="outputformat" value="fastqsanger"/> 204 <param name="outputformat" value="fastqsanger"/>
199 <param name="file_list" value="list_pe"/> 205 <param name="accession" value="SRR925743"/>
200 <param name="maxID" value="5"/> 206 <param name="maxID" value="5"/>
201 <output_collection name="list_paired" type="list:paired"> 207 <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/>
202 <element name="DRR015708"> 208 </test>
203 <element name="forward" file="DRR015708_forward.fastqsanger"> 209 <test>
204 </element> 210 <param name="input_select" value="file_list"/>
205 <element name="reverse" file="DRR015708_reverse.fastqsanger"> 211 <param name="outputformat" value="fastqsanger"/>
206 </element> 212 <param name="file_list" value="list_pe"/>
207 </element> 213 <param name="maxID" value="5"/>
208 </output_collection> 214 <output_collection name="list_paired" type="list:paired">
209 </test> 215 <element name="DRR015708">
210 <test> 216 <element name="forward" file="DRR015708_forward.fastqsanger">
211 <param name="input_select" value="file_list"/> 217 </element>
212 <param name="outputformat" value="fastqsanger"/> 218 <element name="reverse" file="DRR015708_reverse.fastqsanger">
213 <param name="file_list" value="list_pe2"/> 219 </element>
214 <param name="maxID" value="5"/> 220 </element>
215 <output_collection name="list_paired" type="list:paired"> 221 </output_collection>
216 <element name="ERR027433"> 222 </test>
217 <element name="forward" file="ERR027433_forward.fastqsanger"> 223 <test>
218 </element> 224 <param name="input_select" value="file_list"/>
219 <element name="reverse" file="ERR027433_reverse.fastqsanger"> 225 <param name="outputformat" value="fastqsanger"/>
220 </element> 226 <param name="file_list" value="list_pe2"/>
221 </element> 227 <param name="maxID" value="5"/>
222 </output_collection> 228 <output_collection name="list_paired" type="list:paired">
223 </test> 229 <element name="ERR027433">
224 <test> 230 <element name="forward" file="ERR027433_forward.fastqsanger">
225 <param name="input_select" value="file_list"/> 231 </element>
226 <param name="outputformat" value="fastqsanger"/> 232 <element name="reverse" file="ERR027433_reverse.fastqsanger">
227 <param name="file_list" value="list_se"/> 233 </element>
228 <param name="maxID" value="5"/> 234 </element>
229 <output_collection name="output_collection" type="list"> 235 </output_collection>
230 <element name="SRR1993644" file="SRR1993644.fastqsanger"/> 236 </test>
231 </output_collection> 237 <test>
232 </test> 238 <param name="input_select" value="file_list"/>
239 <param name="outputformat" value="fastqsanger"/>
240 <param name="file_list" value="list_se"/>
241 <param name="maxID" value="5"/>
242 <output_collection name="output_collection" type="list">
243 <element name="SRR1993644" file="SRR1993644.fastqsanger"/>
244 </output_collection>
245 </test>
233 </tests> 246 </tests>
234 <help> 247 <help><![CDATA[
235 This tool extracts reads from SRA archives using fastq-dump. 248 **What it does**
236 The fastq-dump program is developed at NCBI, and is available at 249
237 http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software. 250 This tool extracts data (in fastq_ format) from the Sequence Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit.
238 251
239 NB: Single-end or pair-end collections may be empty if given SRRs LibraryLayout contains only either SINGLE or PAIRED respectively 252 **How to use it**
240 @SRATOOLS_ATTRRIBUTION@ 253
254 There are three ways in which you can download data:
255
256 1. Data for a single accession
257 2. Multiple datasets using a list of accessions
258 3. Extract data from an already uploaded SRA dataset
259
260 Below we discuss each in detail.
261
262 ------
263
264 **Uploading data for a single accession**
265
266 When you type a single accession number (e.g., `SRR1582967`) into the **Accession** box and click **Execute**, the tool will fetch the data for you. It is important to keep the following in mind:
267
268 - if the data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates alternate (see the example dataset below)
269 - if the data is single-end, a standard single fastq dataset will be produced
270
271 -----
272
273 **Uploading multiple datasets using a list of accessions**
274
275 A more realistic scenario is when you want to upload a number of datasets at once. To do this, you need a list of accessions with only one accession per line (see below for information on how to generate such a file). Once you have this file:
276
277 1. Upload it into your history using Galaxy's upload tool
278 2. Once the list of accessions is uploaded, choose *List of SRA accessions, one per line* from the **select input type** drop-down
279 3. Choose the uploaded file within the **sra accession list** field
280 4. Click **Execute**
281
282 .. class:: warningmark
283
284 Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. The single-end or paired-end collection may be empty if the accessions provided in the list contain only PAIRED or SINGLE data, respectively.
285
286 -----
287
288 **Extract data from already uploaded SRA dataset**
289
290 If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. Just as when extracting data for a single accession number, the following applies:
291
292 - if the data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates alternate (see the example below).
293 - if the data is single-end, a standard fastq dataset will be produced
294
295 @ACCESSION_LIST_HOWTO@
296
297 -----
298
299 **Paired-end (and mate-pair) data in fastq format**
300
301 Paired-end datasets can be represented as two individual datasets:
302
303 First dataset::
304
305 @1/1
306 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
307 +
308 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
309 @2/1
310 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
311 +
312 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
313
314 Second dataset::
315
316 @1/2
317 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
318 +
319 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
320 @2/2
321 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
322 +
323 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
324
325 Or a single *interleaved* dataset::
326
327 @1/1
328 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
329 +
330 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
331 @1/2
332 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
333 +
334 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
335 @2/1
336 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
337 +
338 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
339 @2/2
340 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
341 +
342 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
343
344 ----
345
346
347 .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
348 .. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html
349 .. _collection: https://galaxyproject.org/tutorials/collections/
350 .. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies
351
352 @SRATOOLS_ATTRRIBUTION@
353
354 ]]>
241 </help> 355 </help>
242 <expand macro="citation"/> 356 <expand macro="citation"/>
243 </tool> 357 </tool>