comparison datasets_gene.xml @ 11:ac24fff14f23 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
author iuc
date Fri, 02 Dec 2022 10:52:48 +0000
parents
children d78faac2c6ef
comparison
equal deleted inserted replaced
10:a3395b1d871b 11:ac24fff14f23
1 <tool id="datasets_download_gene" name="NCBI Datasets Gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>download gene sequences and metadata</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements"></expand>
7 <command><![CDATA[
8 #import re
9 @SETUP_CERTIFICATES@
10 datasets download gene $query.subcommand.download_by
11 #if $query.subcommand.download_by == 'taxon':
12 '$query.subcommand.taxon_positional'
13 #else:
14 #if $query.subcommand.text_or_file.text_or_file == 'text':
15 #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
16 #else
17 --inputfile '$query.subcommand.text_or_file.inputfile'
18 #end if
19 #end if
20 ## --ortholog is only passed for non-taxon queries, and only when a value was entered
21 #if $query.subcommand.download_by != 'taxon' and $query.subcommand.ortholog:
22 --ortholog '$query.subcommand.ortholog'
23 #end if
24 ## --taxon is only passed for symbol queries
25 #if $query.subcommand.download_by == 'symbol':
26 #if $query.subcommand.taxon
27 --taxon '$query.subcommand.taxon'
28 #end if
29 #end if
30 ## --taxon-filter and --include-flanks-bp are only passed for accession queries
31 #if $query.subcommand.download_by == 'accession':
32 #if $query.subcommand.taxon_filter
33 --taxon-filter '$query.subcommand.taxon_filter'
34 #end if
35 #if str($query.subcommand.include_flanks_bp)
36 --include-flanks-bp $query.subcommand.include_flanks_bp
37 #end if
38 #end if
39 ## optional FASTA filter: individually quoted accessions from the text field, or a dataset via --fasta-filter-file
40 #if $filters.fasta_filter_cond.fasta_filter_select
41 #if $filters.fasta_filter_cond.fasta_filter_select == 'text'
42 --fasta-filter #echo ",".join(f"'{x}'" for x in $filters.fasta_filter_cond.fasta_filter.split(',') if x)
43 #else
44 --fasta-filter-file '$filters.fasta_filter_cond.fasta_filter_file'
45 #end if
46 #end if
47 ## --include receives the comma-joined selection, or the literal "none" when nothing is selected
48 --include
49 #if $file_choices.kingdom_cond.include
50 #echo ",".join($file_choices.kingdom_cond.include)
51 #else
52 none
53 #end if
54
55 --no-progressbar
56
57 ## produce the TSV data report; the report type (gene or prok-gene) is the selected kingdom_sel value
58 &&
59 dataformat
60 tsv
61 $file_choices.kingdom_cond.kingdom_sel
62 --package ncbi_dataset.zip
63 --fields #echo ",".join($file_choices.kingdom_cond.report_columns)
64 > gene_data_report.tsv
65 ## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then
66 ## dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log;
67 ## fi
68 ## the gene-product report is only written when "product-report" is among the selected includes
69 #if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include
70 && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv
71 #end if
72
73 ## unzip the downloaded package if any data is to be downloaded (include is not None); no rehydration step is run here
74 #if $file_choices.kingdom_cond.include
75 ## unzip
76 && 7z x -y ncbi_dataset.zip > 7z.log
77 #end if
78 ]]></command>
79 <inputs>
80 <section name="query" title="Query" expanded="true">
81 <conditional name="subcommand">
82 <param name="download_by" type="select" label="Choose how to find genes to download">
83 <option value="gene-id">By NCBI Gene ID</option>
84 <option value="symbol">By Gene symbol</option>
85 <option value="accession">By RefSeq nucleotide or protein accession</option>
86 <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
87 </param>
88 <when value="gene-id">
89 <expand macro="text_or_file" what="Gene ID" what_extended="NCBI Gene ID" help=""/>
90 <expand macro="ortholog"/>
91 </when>
92 <when value="symbol">
93 <expand macro="text_or_file" what="Gene Symbol" what_extended="NCBI Gene Symbol" help=""/>
94 <expand macro="ortholog"/>
95 <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">
96 <sanitizer invalid_char=""> <!-- characters outside the valid set are removed; digits are allowed so numeric NCBI taxids survive -->
97 <valid initial="string.letters,string.digits">
98 <add value=" " />
99 <add value="-" />
100 </valid>
101 </sanitizer>
102 </param>
103 </when>
104 <when value="accession">
105 <expand macro="text_or_file" what="Gene Accession" what_extended="NCBI Gene Accession" help=""/>
106 <expand macro="ortholog"/>
107 <param argument="--taxon-filter" type="text" value="" label="Limit gene sequences and annotation report file to specified taxon" help="any rank, only available for WP accessions">
108 <sanitizer invalid_char=""> <!-- NOTE(review): only letters, space and '-' are kept; digits are removed, so a numeric taxid cannot be entered here - confirm this is intended -->
109 <valid initial="string.letters">
110 <add value=" " />
111 <add value="-" />
112 </valid>
113 </sanitizer>
114 </param>
115 <param argument="--include-flanks-bp" type="integer" optional="true" min="0" label="Length of flanking nucleotides" help="WP accessions only"/>
116 </when>
117 <when value="taxon">
118 <expand macro="taxon_positional"/>
119 </when>
120 </conditional>
121 </section>
122 <section name="filters" title="Filters and Limit">
123 <conditional name="fasta_filter_cond" label="Filter protein and RNA sequences by RefSeq nucleotide and protein accessions">
124 <param name="fasta_filter_select" type="select" label="Apply filter">
125 <option value="">No</option>
126 <option value="text">Enter accessions</option>
127 <option value="file">Read a list of accessions from a dataset</option>
128 </param>
129 <when value=""/>
130 <when value="text">
131 <param argument="--fasta-filter" type="text" label="RefSeq nucleotide and protein accessions" help="Comma separated">
132 <sanitizer invalid_char=""> <!-- '_' and '.' must be kept: accessions such as NP_542432.2 contain both -->
133 <valid initial="string.letters,string.digits">
134 <add value="," /><add value="_" /><add value="." />
135 </valid>
136 </sanitizer>
137 </param>
138 </when>
139 <when value="file">
140 <param argument="--fasta-filter-file" type="data" format="txt" label="Dataset with list of RefSeq nucleotide and protein accessions" help=""/>
141 </when>
142 </conditional>
143 </section>
144 <section name="file_choices" title="Output options" expanded="true">
145 <conditional name="kingdom_cond">
146 <param name="kingdom_sel" type="select" label="Kingdom" help="Prokaryotic: Accessions starting with WP_. Data report has a different format and the rna, cds, 3/5' UTR and gene-product report are not supported."> <!-- the selected value is also used as the dataformat report type in the command -->
147 <option value="gene">Eukaryote</option>
148 <option value="prok-gene">Prokaryote</option>
149 </param>
150 <when value="gene">
151 <expand macro="gene_tsv_report_columns">
152 <option value="gene-id" selected="true">NCBI GeneID</option>
153 <option value="gene-type" selected="true">Gene Type</option>
154 <option value="common-name" selected="true">Common Name</option>
155 <option value="description" selected="true">Description</option>
156 <option value="symbol" selected="true">Symbol</option>
157 <option value="synonyms" selected="true">Synonyms</option>
158 <option value="tax-id" selected="true">Taxonomic ID</option>
159 <option value="tax-name" selected="true">Taxonomic Name</option>
160 </expand>
161 <expand macro="include"> <!-- additional include choices offered only for the eukaryote (gene) report type -->
162 <expand macro="gene_includes">
163 <option value="rna" selected="true">transcript (rna)</option>
164 <option value="cds">nucleotide coding sequences (cds)</option>
165 <option value="5p-utr">5'-UTR (5p-utr)</option>
166 <option value="3p-utr">3'-UTR (3p-utr)</option>
167 <option value="product-report">gene product report (product-report)</option>
168 </expand>
169 </expand>
170 </when>
171 <when value="prok-gene">
172 <expand macro="prok_gene_tsv_report_columns">
173 <option value="accession" selected="true">Accession</option>
174 <option value="description" selected="true">Description</option>
175 <option value="ec-number" selected="true">EC Number</option>
176 <option value="gene-symbol" selected="true">Gene Symbol</option>
177 <option value="mapping-count" selected="true">Number of Genome Mappings</option>
178 <option value="protein-length" selected="true">Protein Length</option>
179 <option value="protein-name" selected="true">Protein Name</option>
180 </expand>
181 <expand macro="include">
182 <expand macro="gene_includes"/>
183 </expand>
184 </when>
185 </conditional>
186 <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/> <!-- NOTE(review): the command template in this file never references file_choices.decompress, so this switch appears to have no effect - confirm -->
187 </section>
188 </inputs>
189 <outputs>
190 <data name="gene_data_report" format="tabular" label="NCBI Gene Datasets: Data Report" from_work_dir="gene_data_report.tsv"/>
191 <data name="gene_product_report" format="tabular" label="NCBI Gene Datasets: Product Report" from_work_dir="gene_product_report.tsv">
192 <filter>file_choices['kingdom_cond']['include'] and "product-report" in file_choices['kingdom_cond']['include']</filter>
193 </data>
194 <data name="gene_fasta" label="NCBI Gene Datasets: Gene fasta" format="fasta" from_work_dir="ncbi_dataset/data/gene.fna">
195 <filter>file_choices['kingdom_cond']['include'] and "gene" in file_choices['kingdom_cond']['include']</filter>
196 </data>
197 <data name="rna_fasta" label="NCBI Gene Datasets: RNA fasta" format="fasta" from_work_dir="ncbi_dataset/data/rna.fna">
198 <filter>file_choices['kingdom_cond']['include'] and "rna" in file_choices['kingdom_cond']['include']</filter>
199 </data>
200 <data name="protein_fasta" label="NCBI Gene Datasets: protein fasta" format="fasta" from_work_dir="ncbi_dataset/data/protein.faa">
201 <filter>file_choices['kingdom_cond']['include'] and "protein" in file_choices['kingdom_cond']['include']</filter>
202 </data>
203 <data name="cds_fasta" label="NCBI Gene Datasets: CDS fasta" format="fasta" from_work_dir="ncbi_dataset/data/cds.fna">
204 <filter>file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']</filter>
205 </data>
206 <data name="threep_utr_fasta" label="NCBI Gene Datasets: 3' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/3p_utr.fna"> <!-- collected only when the 3'-UTR sequences ("3p-utr") were requested -->
207 <filter>file_choices['kingdom_cond']['include'] and "3p-utr" in file_choices['kingdom_cond']['include']</filter>
208 </data>
209 <data name="fivep_utr_fasta" label="NCBI Gene Datasets: 5' UTR fasta" format="fasta" from_work_dir="ncbi_dataset/data/5p_utr.fna">
210 <filter>file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']</filter>
211 </data>
212 </outputs>
213 <tests>
214 <!-- 1: datasets download gene gene-id 672 -->
215 <test expect_num_outputs="3">
216 <conditional name="query|subcommand">
217 <param name="download_by" value="gene-id"/>
218 <conditional name="text_or_file">
219 <param name="text_or_file" value="text"/>
220 <param name="accession" value="672"/>
221 </conditional>
222 </conditional>
223 <output name="gene_data_report">
224 <assert_contents>
225 <has_text text="human"/>
226 <has_text text="BRCA1"/>
227 <has_n_lines n="2"/>
228 <has_n_columns n="8"/>
229 </assert_contents>
230 </output>
231 <output name="rna_fasta">
232 <assert_contents>
233 <has_text text=">"/>
234 </assert_contents>
235 </output>
236 <output name="protein_fasta">
237 <assert_contents>
238 <has_text text=">"/>
239 </assert_contents>
240 </output>
241 </test>
242 <!-- 2: datasets download gene gene-id 2597 14433 -->
243 <test expect_num_outputs="3">
244 <conditional name="query|subcommand">
245 <param name="download_by" value="gene-id"/>
246 <conditional name="text_or_file">
247 <param name="text_or_file" value="text"/>
248 <param name="accession" value="2597,14433"/>
249 </conditional>
250 </conditional>
251 <output name="gene_data_report">
252 <assert_contents>
253 <has_text text="house mouse"/>
254 <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
255 <has_n_lines n="3"/>
256 <has_n_columns n="8"/>
257 </assert_contents>
258 </output>
259 <output name="rna_fasta">
260 <assert_contents>
261 <has_text text=">"/>
262 </assert_contents>
263 </output>
264 <output name="protein_fasta">
265 <assert_contents>
266 <has_text text=">"/>
267 </assert_contents>
268 </output>
269 </test>
270 <!-- 3: same as above + give accessions by file, 2 different outputs and ortholog-->
271 <test expect_num_outputs="3">
272 <conditional name="query|subcommand">
273 <param name="download_by" value="gene-id"/>
274 <conditional name="text_or_file">
275 <param name="text_or_file" value="file"/>
276 <param name="inputfile" value="geneids.txt"/>
277 </conditional>
278 <param name="ortholog" value="Haplorrhini,Strepsirrhini"/>
279 </conditional>
280 <section name="file_choices">
281 <conditional name="kingdom_cond">
282 <param name="include" value="gene,cds"/>
283 </conditional>
284 </section>
285 <output name="gene_data_report">
286 <assert_contents>
287 <has_text text="baboon"/>
288 <has_text text="glyceraldehyde-3-phosphate dehydrogenase"/>
289 <has_n_lines n="31"/>
290 <has_n_columns n="8"/>
291 </assert_contents>
292 </output>
293 <output name="gene_fasta">
294 <assert_contents>
295 <has_text text=">"/>
296 </assert_contents>
297 </output>
298 <output name="cds_fasta">
299 <assert_contents>
300 <has_text text=">"/>
301 </assert_contents>
302 </output>
303 </test>
304 <!-- 4: datasets download gene symbol tp53 -->
305 <test expect_num_outputs="1">
306 <conditional name="query|subcommand">
307 <param name="download_by" value="symbol"/>
308 <conditional name="text_or_file">
309 <param name="text_or_file" value="text"/>
310 <param name="accession" value="tp53"/>
311 </conditional>
312 </conditional>
313 <section name="file_choices">
314 <conditional name="kingdom_cond">
315 <param name="include" value=""/>
316 </conditional>
317 </section>
318 <output name="gene_data_report">
319 <assert_contents>
320 <has_text text="human"/>
321 <has_n_lines n="2"/>
322 <has_n_columns n="8"/>
323 </assert_contents>
324 </output>
325 </test>
326 <!-- 5: datasets download gene symbol brca1 \-\-taxon mouse -->
327 <test expect_num_outputs="4">
328 <conditional name="query|subcommand">
329 <param name="download_by" value="symbol"/>
330 <conditional name="text_or_file">
331 <param name="text_or_file" value="text"/>
332 <param name="accession" value="brca1"/>
333 </conditional>
334 <param name="taxon" value="mouse"/>
335 </conditional>
336 <section name="file_choices">
337 <conditional name="kingdom_cond">
338 <param name="include" value="3p-utr,5p-utr,product-report"/>
339 </conditional>
340 </section>
341 <output name="gene_data_report">
342 <assert_contents>
343 <has_text text="house mouse"/>
344 <has_text text="Brca1"/>
345 <has_n_lines n="2"/>
346 <has_n_columns n="8"/>
347 </assert_contents>
348 </output>
349 <output name="gene_product_report">
350 <assert_contents>
351 <has_text text="house mouse"/>
352 <has_text text="XR_004936704.1"/>
353 <has_n_lines n="137"/>
354 <has_n_columns n="38"/>
355 </assert_contents>
356 </output>
357 <output name="threep_utr_fasta">
358 <assert_contents>
359 <has_text text=">"/>
360 </assert_contents>
361 </output>
362 <output name="fivep_utr_fasta">
363 <assert_contents>
364 <has_text text=">"/>
365 </assert_contents>
366 </output>
367 </test>
368 <!-- 6: datasets download gene symbol brca1 \-\-ortholog -->
369 <test expect_num_outputs="1">
370 <conditional name="query|subcommand">
371 <param name="download_by" value="symbol"/>
372 <conditional name="text_or_file">
373 <param name="text_or_file" value="text"/>
374 <param name="accession" value="brca1"/>
375 </conditional>
376 <param name="ortholog" value="rodentia"/>
377 </conditional>
378 <section name="file_choices">
379 <conditional name="kingdom_cond">
380 <param name="include" value=""/>
381 </conditional>
382 </section>
383 <output name="gene_data_report">
384 <assert_contents>
385 <has_text text="rat"/>
386 <has_text text="Brca1"/>
387 <has_n_lines n="38"/>
388 <has_n_columns n="8"/>
389 </assert_contents>
390 </output>
391 </test>
392
393 <!-- 7: datasets download gene accession NP_000483.3 -->
394 <test expect_num_outputs="1">
395 <conditional name="query|subcommand">
396 <param name="download_by" value="accession"/>
397 <conditional name="text_or_file">
398 <param name="text_or_file" value="text"/>
399 <param name="accession" value="NP_000483.3"/>
400 </conditional>
401 </conditional>
402 <section name="file_choices">
403 <conditional name="kingdom_cond">
404 <param name="include" value=""/>
405 </conditional>
406 </section>
407 <output name="gene_data_report">
408 <assert_contents>
409 <has_text text="human"/>
410 <has_n_lines n="2"/>
411 <has_n_columns n="8"/>
412 </assert_contents>
413 </output>
414 </test>
415 <!-- 8: datasets download gene accession NM_000546.6 NM_000492.4 + ortholog-->
416 <test expect_num_outputs="1">
417 <conditional name="query|subcommand">
418 <param name="download_by" value="accession"/>
419 <conditional name="text_or_file">
420 <param name="text_or_file" value="text"/>
421 <param name="accession" value="NM_000546.6 NM_000492.4"/>
422 </conditional>
423 <param name="ortholog" value="true"/>
424 </conditional>
425 <section name="file_choices">
426 <conditional name="kingdom_cond">
427 <param name="include" value=""/>
428 </conditional>
429 </section>
430 <output name="gene_data_report">
431 <assert_contents>
432 <has_text text="human"/>
433 <has_n_lines n="823"/>
434 <has_n_columns n="8"/>
435 </assert_contents>
436 </output>
437 </test>
438
439 <!-- 9: datasets download gene accession WP_004675351.1 + include_flanks_bp -->
440 <test expect_num_outputs="3">
441 <conditional name="query|subcommand">
442 <param name="download_by" value="accession"/>
443 <conditional name="text_or_file">
444 <param name="text_or_file" value="text"/>
445 <param name="accession" value="WP_004675351.1"/>
446 </conditional>
447 <param name="include_flanks_bp" value="100"/>
448 </conditional>
449 <section name="file_choices">
450 <conditional name="kingdom_cond">
451 <param name="kingdom_sel" value="prok-gene"/>
452 <param name="include" value="gene,protein"/>
453 </conditional>
454 </section>
455 <output name="gene_data_report">
456 <assert_contents>
457 <has_text text="glcE"/>
458 <has_n_lines n="2"/>
459 <has_n_columns n="7"/>
460 </assert_contents>
461 </output>
462 <output name="gene_fasta">
463 <assert_contents>
464 <has_text text=">"/>
465 </assert_contents>
466 </output>
467 <output name="protein_fasta">
468 <assert_contents>
469 <has_text text=">"/>
470 </assert_contents>
471 </output>
472 <assert_command>
473 <has_text text="include-flanks-bp 100"/>
474 </assert_command>
475 </test>
476
477 <!-- 10: datasets download gene taxon human -->
478 <test expect_num_outputs="1">
479 <conditional name="query|subcommand">
480 <param name="download_by" value="taxon"/>
481 <param name="taxon_positional" value="human"/>
482 </conditional>
483 <section name="file_choices">
484 <conditional name="kingdom_cond">
485 <param name="include" value=""/>
486 </conditional>
487 </section>
488 <output name="gene_data_report">
489 <assert_contents>
490 <has_text text="human"/>
491 <has_n_lines n="72533"/>
492 <has_n_columns n="8"/>
493 </assert_contents>
494 </output>
495 </test>
496 <!-- 11: datasets download gene taxon human + \-\-fasta-filter -->
497 <test expect_num_outputs="2">
498 <conditional name="query|subcommand">
499 <param name="download_by" value="taxon"/>
500 <param name="taxon_positional" value="human"/>
501 </conditional>
502 <section name="file_choices">
503 <conditional name="kingdom_cond">
504 <param name="include" value="protein"/>
505 </conditional>
506 </section>
507 <section name="filters">
508 <conditional name="fasta_filter_cond">
509 <param name="fasta_filter_select" value="text"/>
510 <param name="fasta_filter" value="NP_542432.2"/>
511 </conditional>
512 </section>
513 <output name="gene_data_report">
514 <assert_contents>
515 <has_text text="human"/>
516 <has_n_lines n="72533"/>
517 <has_n_columns n="8"/>
518 </assert_contents>
519 </output>
520 <output name="protein_fasta">
521 <assert_contents>
522 <has_text text=">" n="1" />
523 </assert_contents>
524 </output></test>
525 </tests>
526 <help>
527 <![CDATA[
528 **Download Gene Datasets from NCBI**
529
530 Download a gene dataset (gene sequence, transcript, amino acid sequences,
531 nucleotide coding sequences, 5'-UTR, 3'-UTR) as well as gene and gene
532 product reports. Genes can be referred by gene id, symbol, accession,
533 or taxon.
534 ]]>
535 </help>
536 </tool>