comparison fastq_dump.xml @ 27:9a776b080193 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit cbb1499906c801443d72bdf313d86f0182aca010
author iuc
date Sun, 22 Jan 2023 17:51:50 +0000
parents 83c7d564b128
children 4317d3cb6cba
comparison
equal deleted inserted replaced
26:83c7d564b128 27:9a776b080193
1 <tool id="fastq_dump" name="Download and Extract Reads in FASTA/Q" version="@VERSION@+galaxy0" profile="18.01"> 1 <tool id="fastq_dump" name="Download and Extract Reads in FASTQ" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>format from NCBI SRA</description> 2 <description>format from NCBI SRA</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
3 <expand macro="bio_tools"/> 7 <expand macro="bio_tools"/>
4 <macros>
5 <import>sra_macros.xml</import>
6 </macros>
7 <expand macro="requirements"/> 8 <expand macro="requirements"/>
8 <version_command>fastq-dump --version</version_command> 9 <version_command>fastq-dump --version | tr -d $'\n'</version_command>
9 <command detect_errors="exit_code"><![CDATA[ 10 <command detect_errors="exit_code"><![CDATA[
10 @COPY_CONFIGFILE@ 11 @COPY_CONFIGFILE@
11 @SET_ACCESSIONS@ 12 @SET_ACCESSIONS@
12 13
13 #if $input.input_select == "file": 14 #if $input.input_select == "sra_file":
14 fastq-dump --log-level fatal --accession '${input.file.name}' 15 fastq-dump --log-level fatal --accession '${input.file.name}'
15 #else: 16 #else:
16 ## Do not use prefetch if region is specified, to avoid downloading 17 ## Do not use prefetch if region is specified, to avoid downloading
17 ## the complete sra file. 18 ## the complete sra file.
18 #if ( str( $adv.region ) == "" ) and ( str( $adv.minID ) == "" ) and ( str( $adv.maxID ) == "" ): 19 #if ( str( $adv.region ) == "" ) and ( str( $adv.minID ) == "" ) and ( str( $adv.maxID ) == "" ):
62 #end if 63 #end if
63 64
64 #if str($adv.table) != "": 65 #if str($adv.table) != "":
65 --table $adv.table 66 --table $adv.table
66 #end if 67 #end if
67 68 ;
68
69 #if $input.input_select=="file":
70 --stdout
71 "$input.file" > "$output_file"
72 69
73 #elif $input.input_select=="accession_number": 70 mkdir -p output &&
74 --stdout 71 data=(\$(ls ./*.fast*));
75 "\$acc" > "$output_accession" ) 72 if [ \${\#data[@]} -eq 2 ]; then
76 #end if 73 mv "\${data[0]}" output/"\${data[0]}"_forward.$outputformat;
77 74 mv "\${data[1]}" output/"\${data[1]}"_reverse.$outputformat;
78 #if $input.input_select=="file_list": 75 elif [ \${\#data[@]} -eq 1 ]; then
79 "\$acc" 76 mv "\${data[0]}" output/"\${data[0]}"__single.$outputformat;
80 ) ; done 77 fi;
81 78
82 ; 79 #if $input.input_select != "sra_file":
83 80 ); done;
84 for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do 81 #end if
85 count=`ls \$i* | wc -l` ; 82 echo "Done with all accessions."
86 data=(\$(ls -d \$i*));
87
88 if [ "\$count" -eq 2 ]; then
89 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ;
90 elif [ "\$count" -eq 1 ]; then
91 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ;
92 fi;
93 done
94
95
96 #end if
97
98
99 ]]> 83 ]]>
100 </command> 84 </command>
101 <expand macro="configfile_hack"/> 85 <expand macro="configfile_hack"/>
102 <inputs> 86 <inputs>
103 <expand macro="input_conditional"/> 87 <expand macro="input_conditional"/>
120 <option value="reject">reject</option> 104 <option value="reject">reject</option>
121 <option value="criteria">criteria</option> 105 <option value="criteria">criteria</option>
122 <option value="redacted">redacted</option> 106 <option value="redacted">redacted</option>
123 </param> 107 </param>
124 <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/> 108 <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/>
125 <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" /> 109 <param type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" />
126 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/> 110 <param type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/>
127 <param name="table" label="Table name within cSRA object" type="text" value="" optional="true" help="For SRA of noisy long-reads put SEQUENCE" argument="--table"/> 111 <param label="Table name within cSRA object" type="text" value="" optional="true" help="For SRA of noisy long-reads put SEQUENCE" argument="--table"/>
128 </section> 112 </section>
129 </inputs> 113 </inputs>
130 <outputs> 114 <outputs>
131 <collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)"> 115 <collection name="list_paired" type="list:paired" label="Paired-end data (fastq-dump)">
132 <filter>input['input_select'] == "file_list"</filter>
133
134 <!-- Use named regex group to grab pattern 116 <!-- Use named regex group to grab pattern
135 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list 117 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list
136 identifier in the nested collection and identifier_1 is either 118 identifier in the nested collection and identifier_1 is either
137 forward or reverse (for instance samp1_forward.fq). 119 forward or reverse (for instance samp1_forward.fq).
138 --> 120 -->
139 121 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger" ext="fastqsanger" directory="output"/>
140 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger" ext="fastqsanger" /> 122 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.gz_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" directory="output"/>
141 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.gz_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" /> 123 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.bz2_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" directory="output"/>
142 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.bz2_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" />
143 </collection> 124 </collection>
144 <collection name="output_collection" type='list' label="Single-end data (fastq-dump)"> 125 <collection name="list_single" type='list' label="Single-end data (fastq-dump)">
145 <filter>input['input_select'] == "file_list"</filter> 126 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastqsanger" directory="output" ext='fastqsanger'/>
146 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastqsanger" directory="." ext='fastqsanger'/> 127 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="output" ext='fastqsanger.gz'/>
147 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="." ext='fastqsanger.gz'/> 128 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="output" ext='fastqsanger.bz2'/>
148 <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="." ext='fastqsanger.bz2'/>
149 </collection> 129 </collection>
150 <data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)">
151 <filter>input['input_select'] == "accession_number"</filter>
152 <change_format>
153 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
154 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
155 </change_format>
156 </data>
157 <data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)">
158 <filter>input['input_select'] == "file"</filter>
159 <change_format>
160 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
161 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
162 </change_format>
163 </data>
164 </outputs> 130 </outputs>
165 <tests> 131 <tests>
166 <test> 132 <test expect_num_outputs="2">
167 <param name="input_select" value="accession_number"/> 133 <param name="input_select" value="accession_number"/>
168 <param name="outputformat" value="fastqsanger"/> 134 <param name="outputformat" value="fastqsanger"/>
169 <param name="accession" value="SRR044777"/> 135 <param name="accession" value="SRR044777"/>
170 <param name="skip_technical" value="True"/> 136 <param name="skip_technical" value="True"/>
171 <output name="output_accession"> 137 <output_collection name="list_single" type="list" count="1">
172 <assert_contents> 138 <element name="SRR044777">
173 <not_has_text text="rRNA_primer"/> 139 <assert_contents>
174 <has_text text="F47USSH02GNP1D" /> 140 <not_has_text text="rRNA_primer"/>
175 </assert_contents> 141 <has_text text="F47USSH02GNP1D"/>
176 </output> 142 </assert_contents>
177 </test> 143 </element>
178 <test> 144 </output_collection>
145 </test>
146 <test expect_num_outputs="2">
179 <param name="input_select" value="accession_number"/> 147 <param name="input_select" value="accession_number"/>
180 <param name="outputformat" value="fastqsanger.gz"/> 148 <param name="outputformat" value="fastqsanger.gz"/>
181 <param name="accession" value="SRR925743"/> 149 <param name="accession" value="SRR925743"/>
182 <param name="maxID" value="5"/> 150 <param name="maxID" value="5"/>
183 <output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/> 151 <output_collection name="list_paired" type="list:paired" count="1">
184 </test> 152 <element name="SRR925743">
185 <test> 153 <element name="forward" file="SRR925743_forward.fastqsanger" decompress="True"/>
154 <element name="reverse" file="SRR925743_reverse.fastqsanger" decompress="True"/>
155 </element>
156 </output_collection>
157 </test>
158 <test expect_num_outputs="2">
186 <param name="input_select" value="accession_number"/> 159 <param name="input_select" value="accession_number"/>
187 <param name="outputformat" value="fastqsanger"/> 160 <param name="outputformat" value="fastqsanger"/>
188 <param name="accession" value="SRR925743"/> 161 <param name="accession" value="SRR925743"/>
189 <param name="maxID" value="5"/> 162 <param name="maxID" value="5"/>
190 <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> 163 <output_collection name="list_paired" type="list:paired" count="1">
191 </test> 164 <element name="SRR925743">
192 <test> 165 <element name="forward" file="SRR925743_forward.fastqsanger"/>
166 <element name="reverse" file="SRR925743_reverse.fastqsanger"/>
167 </element>
168 </output_collection>
169 </test>
170 <test expect_num_outputs="2">
193 <param name="input_select" value="file_list"/> 171 <param name="input_select" value="file_list"/>
194 <param name="outputformat" value="fastqsanger"/> 172 <param name="outputformat" value="fastqsanger"/>
195 <param name="file_list" value="list_pe"/> 173 <param name="file_list" value="list_pe"/>
196 <param name="maxID" value="5"/> 174 <param name="maxID" value="5"/>
197 <output_collection name="list_paired" type="list:paired"> 175 <output_collection name="list_paired" type="list:paired" count="1">
198 <element name="DRR015708"> 176 <element name="DRR015708">
199 <element name="forward" file="DRR015708_forward.fastqsanger"> 177 <element name="forward" file="DRR015708_forward.fastqsanger"/>
200 </element> 178 <element name="reverse" file="DRR015708_reverse.fastqsanger"/>
201 <element name="reverse" file="DRR015708_reverse.fastqsanger"> 179 </element>
202 </element> 180 </output_collection>
203 </element> 181 </test>
204 </output_collection> 182 <test expect_num_outputs="2">
205 </test>
206 <test>
207 <param name="input_select" value="file_list"/> 183 <param name="input_select" value="file_list"/>
208 <param name="outputformat" value="fastqsanger"/> 184 <param name="outputformat" value="fastqsanger"/>
209 <param name="file_list" value="list_pe2"/> 185 <param name="file_list" value="list_pe2"/>
210 <param name="maxID" value="5"/> 186 <param name="maxID" value="5"/>
211 <output_collection name="list_paired" type="list:paired"> 187 <output_collection name="list_paired" type="list:paired" count="1">
212 <element name="ERR027433"> 188 <element name="ERR027433">
213 <element name="forward" file="ERR027433_forward.fastqsanger"> 189 <element name="forward" file="ERR027433_forward.fastqsanger"/>
214 </element> 190 <element name="reverse" file="ERR027433_reverse.fastqsanger"/>
215 <element name="reverse" file="ERR027433_reverse.fastqsanger">
216 </element>
217 </element> 191 </element>
218 </output_collection> 192 </output_collection>
219 </test> 193 </test>
220 <test> 194 <test expect_num_outputs="2">
221 <param name="input_select" value="file_list"/> 195 <param name="input_select" value="file_list"/>
222 <param name="outputformat" value="fastqsanger"/> 196 <param name="outputformat" value="fastqsanger"/>
223 <param name="file_list" value="list_se"/> 197 <param name="file_list" value="list_se"/>
224 <param name="maxID" value="5"/> 198 <param name="maxID" value="5"/>
225 <output_collection name="output_collection" type="list"> 199 <output_collection name="list_single" type="list" count="1">
226 <element name="SRR1993644" file="SRR1993644.fastqsanger"/> 200 <element name="SRR1993644" file="SRR1993644.fastqsanger"/>
227 </output_collection> 201 </output_collection>
228 </test> 202 </test>
229 <test> 203 <test expect_num_outputs="2">
230 <param name="input_select" value="accession_number"/> 204 <param name="input_select" value="accession_number"/>
231 <param name="outputformat" value="fastqsanger.gz"/> 205 <param name="outputformat" value="fastqsanger.gz"/>
232 <param name="accession" value="SRR6982805"/> 206 <param name="accession" value="SRR6982805"/>
233 <param name="maxID" value="2"/> 207 <param name="maxID" value="2"/>
234 <param name="table" value="SEQUENCE"/> 208 <param name="table" value="SEQUENCE"/>
235 <output name="output_accession" file="SRR6982805.fastqsanger.gz" ftype="fastqsanger.gz" decompress="True"/> 209 <output_collection name="list_single" type="list" count="1">
236 </test> 210 <element name="SRR6982805" file="SRR6982805.fastqsanger.gz" ftype="fastqsanger.gz" decompress="True"/>
211 </output_collection>
212 </test>
213 <test expect_num_outputs="2">
214 <param name="input_select" value="accession_number"/>
215 <param name="outputformat" value="fastqsanger.gz"/>
216 <param name="accession" value="ERR086330, SRR11953971"/>
217 <output_collection name="list_paired" type="list:paired" count="2">
218 <element name="ERR086330">
219 <element name="forward" file="ERR086330_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
220 <element name="reverse" file="ERR086330_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
221 </element>
222 <element name="SRR11953971">
223 <element name="forward" file="SRR11953971_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
224 <element name="reverse" file="SRR11953971_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
225 </element>
226 </output_collection>
227 </test>
237 </tests> 228 </tests>
238 <help><![CDATA[ 229 <help><![CDATA[
239 **What it does?** 230 **What it does?**
240 231
241 This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit. 232 This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fasterq-dump_ utility of the SRA Toolkit. The following applies:
242 233
243 **How to use it?** 234 - if data is paired-ended (or mate-pair) the tool will generate a collection of file pairs, in which each element will be a pair of fastq_ files containing forward and reverse mates.
244 235 - if data is single ended, each element of the collection will be a single fastq_ dataset.
245 There are three ways in which you can download data: 236
246 237
247 1. Data for single accession 238 @HOW_TO_USE_IT@
248 2. Multiple datasets using a list of accessions
249 3. Extract data from already uploaded SRA dataset
250
251 Below we discuss each in detail.
252
253 ------
254
255 **Uploading data for a single accession**
256
257 When you type a single accession number (e.g., `SRR1582967`) into **Accession** box and click **Execute** the tool will fetch data for you. It is important to keep the following in mind:
258
259 - if data is paired-ended (or mate-paired) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see an example dataset below)
260 - if data is single ended, a standard single fastq dataset will be produced
261 239
262 ----- 240 -----
263 241
264 **Uploading multiple datasets using a list of accessions** 242 **Output**
265 243
266 A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accession, where there is only one accession per line (see below for information on how to generate such a file). Once you have this file: 244 In every case, fastq datasets produced will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, regardless of the experimental design, three collections will be produced: one containing paired-end data, another containing single-end data, and a third one which contains reads which could not be classified.
245 Some collections may be empty if the accessions provided in the list do not contain one of the types of data.
268 1. Upload it into your history using Galaxy's upload tool
269 2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from **select input type** dropdown
270 3. Choose uploaded file within the **sra accession list** field
271 4. Click **Execute**
272 246
273 .. class:: warningmark 247 .. class:: warningmark
274 248
275 Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. Single-end or pair-end collections may be empty if the accessions provided in the list contain only SINGLE or PAIRED data, respectively. 249 When you decide to dump technical reads (in Advanced Options Dump only biological reads is set to No), you will probably find your PAIRED data in the other data collection as it is impossible to determine if it was 2 biological reads or one biological and one technical.
250
251 .. class:: warningmark
252
253 By default, only biological reads are dumped and, in the case of a PAIRED dataset, only the spots which have both reads will be in the paired-end collection. The remaining single reads will be in the other collection.
254 To keep all reads, and potentially not have the same number of reads in forward and reverse, use the --split-files option in Advanced Options, Select how to split the spots.
255
256 @ACCESSION_LIST_HOWTO@
276 257
277 ----- 258 -----
278 259
279 **Extract data from already uploaded SRA dataset**
280
281 If a SRA dataset is present in the history, it can be converted into fastq dataset by setting **select input type** drop-down to *SRA archive in current history*. Just like in the case of extracting data for single accession number the following applies:
282
283 - if data is paired-ended (or mate-pair) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see example below).
284 - if data is single ended, a standard fastq dataset will be produced
285
286 @ACCESSION_LIST_HOWTO@
287
288 -----
289
290 **Paired-end (and mate-pair) data in fastq format**
291
292 Paired end datasets can be represented as two individual datasets:
293
294 First dataset::
295
296 @1/1
297 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
298 +
299 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
300 @2/1
301 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
302 +
303 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
304
305 Second dataset::
306
307 @1/2
308 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
309 +
310 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
311 @2/2
312 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
313 +
314 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
315
316 Or a single *interleaved* dataset::
317
318 @1/1
319 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
320 +
321 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
322 @1/2
323 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
324 +
325 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
326 @2/1
327 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
328 +
329 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
330 @2/2
331 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
332 +
333 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
334
335 ----
336
337 260
338 .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format 261 .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
339 .. _fastq-dump: https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=toolkit_doc&f=fastq-dump 262 .. _fasterq-dump: https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
340 .. _collection: https://galaxyproject.org/tutorials/collections/ 263 .. _collection: https://galaxyproject.org/tutorials/collections/
341 .. _link: https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies 264 .. _link: https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&display=reads
342 265
343 @SRATOOLS_ATTRRIBUTION@ 266 @SRATOOLS_ATTRRIBUTION@
344
345 ]]> 267 ]]>
346 </help> 268 </help>
347 <expand macro="citation"/> 269 <expand macro="citation"/>
348 </tool> 270 </tool>