Galaxy |

Changeset 11:ac24fff14f23 (2022-12-02)

Previous changeset 10:a3395b1d871b (2022-11-21) Next changeset 12:d78faac2c6ef (2022-12-03)

Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2

modified:
datasets_genome.xml
macros.xml

added:
datasets_gene.xml
test-data/geneids.txt

diff -r a3395b1d871b -r ac24fff14f23 datasets_gene.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets_gene.xml Fri Dec 02 10:52:48 2022 +0000

[

b'@@ -0,0 +1,536 @@\n+<tool id="datasets_download_gene" name="NCBI Datasets Gene" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">\n+ <description>download gene sequences and metadata</description>\n+ <macros>\n+ <import>macros.xml</import>\n+ </macros>\n+ <expand macro="requirements"></expand>\n+ <command><![CDATA[\n+#import re\n+@SETUP_CERTIFICATES@\n+datasets download gene $query.subcommand.download_by\n+#if $query.subcommand.download_by == \'taxon\':\n+ \'$query.subcommand.taxon_positional\'\n+#else:\n+ #if $query.subcommand.text_or_file.text_or_file == \'text\':\n+ #echo " ".join(f"\'{x}\'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)\n+ #else\n+ --inputfile \'$query.subcommand.text_or_file.inputfile\'\n+ #end if\n+#end if\n+\n+#if $query.subcommand.download_by != \'taxon\' and $query.subcommand.ortholog:\n+ --ortholog \'$query.subcommand.ortholog\'\n+#end if\n+\n+#if $query.subcommand.download_by == \'symbol\':\n+ #if $query.subcommand.taxon\n+ --taxon \'$query.subcommand.taxon\'\n+ #end if\n+#end if\n+\n+#if $query.subcommand.download_by == \'accession\':\n+ #if $query.subcommand.taxon_filter\n+ --taxon-filter \'$query.subcommand.taxon_filter\'\n+ #end if\n+ #if str($query.subcommand.include_flanks_bp)\n+ --include-flanks-bp $query.subcommand.include_flanks_bp\n+ #end if\n+#end if\n+\n+#if $filters.fasta_filter_cond.fasta_filter_select\n+ #if $filters.fasta_filter_cond.fasta_filter_select == \'text\'\n+ --fasta-filter #echo ",".join(f"\'{x}\'" for x in $filters.fasta_filter_cond.fasta_filter.split(\',\') if x)\n+ #else\n+ --fasta-filter-file \'$filters.fasta_filter_cond.fasta_filter_file\'\n+ #end if\n+#end if\n+\n+--include\n+#if $file_choices.kingdom_cond.include\n+ #echo ",".join($file_choices.kingdom_cond.include)\n+#else\n+ none\n+#end if\n+\n+--no-progressbar\n+\n+## produce TSV report file (either gene or prok-gene)\n+&& \n+dataformat\n+ tsv\n+ $file_choices.kingdom_cond.kingdom_sel\n+ --package ncbi_dataset.zip\n+ --fields #echo ",".join($file_choices.kingdom_cond.report_columns)\n+ > gene_data_report.tsv\n+## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then\n+## dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log;\n+## fi\n+\n+#if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include\n+ && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv\n+#end if\n+\n+## unzip and rehydrate if any data is to be downloaded (include is not None)\n+#if $file_choices.kingdom_cond.include\n+ ## unzip\n+ && 7z x -y ncbi_dataset.zip > 7z.log\n+#end if\n+]]></command>\n+ <inputs>\n+ <section name="query" title="Query" expanded="true">\n+ <conditional name="subcommand">\n+ <param name="download_by" type="select" label="Choose how to find genes to download">\n+ <option value="gene-id">By NCBI Gene ID</option>\n+ <option value="symbol">By Gene symbol</option>\n+ <option value="accession">By RefSeq nucleotide or protein accession</option>\n+ <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>\n+ </param>\n+ <when value="gene-id">\n+ <expand macro="text_or_file" what="Gene ID" what_extended="NCBI Gene ID" help=""/>\n+ <expand macro="ortholog"/>\n+ </when>\n+ <when value="symbol">\n+ <expand macro="text_or_file" what="Gene Symbol" what_extended="NCBI Gene Symbol" help=""/>\n+ <expand macro="ortholog"/>\n+ <param argument="--taxon" type="text" value="human" label="Species for gene symbol" help="NCBI taxid, common or scientific name">\n+ <sanitizer invalid_'..b' datasets download gene accession WP_004675351.1 + include_flanks_bp -->\n+ <test expect_num_outputs="3">\n+ <conditional name="query|subcommand">\n+ <param name="download_by" value="accession"/>\n+ <conditional name="text_or_file">\n+ <param name="text_or_file" value="text"/>\n+ <param name="accession" value="WP_004675351.1"/>\n+ </conditional>\n+ <param name="include_flanks_bp" value="100"/>\n+ </conditional>\n+ <section name="file_choices">\n+ <conditional name="kingdom_cond">\n+ <param name="kingdom_sel" value="prok-gene"/>\n+ <param name="include" value="gene,protein"/>\n+ </conditional>\n+ </section>\n+ <output name="gene_data_report">\n+ <assert_contents>\n+ <has_text text="glcE"/>\n+ <has_n_lines n="2"/>\n+ <has_n_columns n="7"/>\n+ </assert_contents>\n+ </output>\n+ <output name="gene_fasta">\n+ <assert_contents>\n+ <has_text text=">"/>\n+ </assert_contents>\n+ </output>\n+ <output name="protein_fasta">\n+ <assert_contents>\n+ <has_text text=">"/>\n+ </assert_contents>\n+ </output>\n+ <assert_command>\n+ <has_text text="include-flanks-bp 100"/>\n+ </assert_command>\n+ </test> \n+\n+ \n+ <test expect_num_outputs="1">\n+ <conditional name="query|subcommand">\n+ <param name="download_by" value="taxon"/>\n+ <param name="taxon_positional" value="human"/>\n+ </conditional>\n+ <section name="file_choices">\n+ <conditional name="kingdom_cond">\n+ <param name="include" value=""/>\n+ </conditional>\n+ </section>\n+ <output name="gene_data_report">\n+ <assert_contents>\n+ <has_text text="human"/>\n+ <has_n_lines n="72533"/>\n+ <has_n_columns n="8"/>\n+ </assert_contents>\n+ </output>\n+ </test> \n+ \n+ <test expect_num_outputs="2">\n+ <conditional name="query|subcommand">\n+ <param name="download_by" value="taxon"/>\n+ <param name="taxon_positional" value="human"/>\n+ </conditional>\n+ <section name="file_choices">\n+ <conditional name="kingdom_cond">\n+ <param name="include" value="protein"/>\n+ </conditional>\n+ </section>\n+ <section name="filters">\n+ <conditional name="fasta_filter_cond">\n+ <param name="fasta_filter_select" value="text"/>\n+ <param name="fasta_filter" value="NP_542432.2"/>\n+ </conditional>\n+ </section>\n+ <output name="gene_data_report">\n+ <assert_contents>\n+ <has_text text="human"/>\n+ <has_n_lines n="72533"/>\n+ <has_n_columns n="8"/>\n+ </assert_contents>\n+ </output>\n+ <output name="protein_fasta">\n+ <assert_contents>\n+ <has_text text=">" n="1" />\n+ </assert_contents>\n+ </output></test>\n+ </tests>\n+ <help>\n+<![CDATA[\n+**Download Gene Datasets from NCBI**\n+\n+Download a gene dataset (gene sequence, transcipt, amino acid sequences, \n+nucleotide coding sequences, 5\'-UTR, 3\'-UTR) as well as gene and gene\n+product reports. Genes can be referred by gene id, symbol, accession,\n+or taxon.\n+]]>\n+ </help>\n+</tool>\n'

diff -r a3395b1d871b -r ac24fff14f23 datasets_genome.xml
--- a/datasets_genome.xml Mon Nov 21 11:40:05 2022 +0000
+++ b/datasets_genome.xml Fri Dec 02 10:52:48 2022 +0000

[

b'@@ -5,16 +5,17 @@\n </macros>\n <expand macro="requirements"></expand>\n <command><![CDATA[\n+#import re\n @SETUP_CERTIFICATES@\n datasets download genome $query.subcommand.download_by\n #if $query.subcommand.download_by == \'accession\':\n #if $query.subcommand.text_or_file.text_or_file == \'text\':\n- #echo " ".join(f"\'{x}\'" for x in $query.subcommand.text_or_file.accession.split(\' \') if x)\n+ #echo " ".join(f"\'{x}\'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)\n #else\n --inputfile \'$query.subcommand.text_or_file.inputfile\'\n #end if\n #else:\n- \'$query.subcommand.taxon\'\n+ \'$query.subcommand.taxon_positional\'\n $query.subcommand.tax_exact_match\n #end if\n $filters.reference\n@@ -37,24 +38,55 @@\n --search \'$filters.search_term\'\n #end for\n --no-progressbar\n-#if $uncompressed\n-&& 7z x -y ncbi_dataset.zip\n-#else\n-&& 7z l ncbi_dataset.zip > ncbi_dataset.txt\n+--dehydrated\n+\n+## produce TSV report file\n+&& dataformat tsv genome \n+ --package ncbi_dataset.zip\n+ --fields #echo ",".join($file_choices.report_columns) \n+ > genome_data_report.tsv\n+\n+## unzip and rehydrate if any data is to be downloaded (include is not None)\n+#if $file_choices.include\n+ ## unzip\n+ && 7z x -y ncbi_dataset.zip > 7z.log\n+\n+ ## rehydrate\n+ && datasets rehydrate\n+ --directory ./\n+ #if not $file_choices.decompress\n+ --gzip\n+ #end if\n+ --max-workers \\${NCBI_DATASETS_MAX_WORKERS:-10}\n+\n+ ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery\n+ && find ncbi_dataset \$ -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \$ -exec sh -c \'mv {} \\$(echo {} | sed "s/.f[an]a\$.gz\$\\?\\$/.fasta\\1/")\' \\;\n+\n+ ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)\n+ && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c \'mv {} \\$(dirname {})/\\$(basename {} .gz)\' \\;\n+ #if $file_choices.decompress\n+ && find ncbi_dataset \$ -name "*.gz" ! -name "*fasta.gz" \$ -exec gunzip {} \\;\n+ #end if\n+\n+ #if "seq-report" in $file_choices.include\n+ && find ncbi_dataset -name sequence_report.jsonl -exec sh -c \'dataformat tsv genome-seq --inputfile {} > \\$(dirname {})/\\$(basename {} .jsonl).tsv\' \\;\n+ #end if\n+ \n+ && true ## because Galaxy removes trailing ; from command\n #end if\n ]]></command>\n <inputs>\n <section name="query" title="Query" expanded="true">\n <conditional name="subcommand">\n <param name="download_by" type="select" label="Choose how to find genomes to download">\n- <option value="accession">Download by NCBI assembly or BioProject accession</option>\n- <option value="taxon">Download by taxon</option>\n+ <option value="accession">By NCBI assembly or BioProject accession</option>\n+ <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>\n </param>\n <when value="accession">\n <expand macro="text_or_file"/>\n </when>\n <when value="taxon">\n- <param name="taxon" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>\n+ <expand macro="taxon_positional"/>\n <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>\n </when>\n </conditional>\n@@ -67,7 +99,6 @@\n <option value="latest">Latest</option>\n <option value="all">All</option>\n </param>\n- \n <expand macro="assembly_source"/>\n <expand macro="chromosomes"/>\n '..b'ion name="protein_fasta" type="list" count="1">\n+ <element name="GCF_000146045.2" ftype="fasta.gz">\n+ <assert_contents>\n+ <has_size value="1844838"/>\n+ </assert_contents>\n+ </element>\n+ </output_collection>\n+ <output_collection name="rna_fasta" type="list" count="1">\n+ <element name="GCF_000146045.2" ftype="fasta.gz">\n+ <assert_contents>\n+ <has_size value="2784534"/>\n+ </assert_contents>\n+ </element>\n+ </output_collection>\n </test>\n <test expect_num_outputs="3">\n <conditional name="query|subcommand">\n@@ -307,8 +415,11 @@\n <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>\n </conditional>\n </conditional>\n- <param name="include" value="seq-report,genome"/>\n- <param name="uncompressed" value="true"/>\n+ <section name="file_choices">\n+ <param name="include" value="seq-report,genome"/>\n+ <param name="decompress" value="true"/>\n+ </section>\n+ <output_collection name="sequence_report" type="list" count="2"/>\n <output_collection name="genome_fasta" type="list:list" count="2">\n <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>\n <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>\n@@ -320,12 +431,10 @@\n <test expect_num_outputs="1" expect_test_failure="true">\n <conditional name="query|subcommand">\n <param name="download_by" value="taxon"/>\n- <param name="text_or_file" value="text"/>\n- <param name="taxon" value="4932"/>\n+ <param name="taxon_positional" value="4932"/>\n <param name="tax_exact_match" value="true"/>\n </conditional>\n <param name="include" value=""/>\n- <param name="uncompressed" value="true"/>\n <output name="genome_data_report">\n <assert_contents>\n <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>\n@@ -338,15 +447,18 @@\n **Download Genome Datasets from NCBI**\n \n Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.\n-Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.\n+Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.\n+\n+The download is a three step process:\n \n-Tthe default genome dataset includes the following files (if available):\n- * data_report.jsonl (genome assembly and annotation metadata, not always available)\n- * genomic.fna (genomic sequences)\n- * rna.fna (transcript sequences)\n- * protein.faa (protein sequences)\n- * genomic.gff (genome annotation in gff3 format)\n- * dataset_catalog.json (a list of files and file types included in the dataset)\n+1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL)\n+2. The metadata is transformed into a tabular (TSV) file\n+3. The data is hydrated (the actual data is downloaded)\n+\n+The 3rd step can be skipped by unselecting all output types in the `Include` parameter.\n+Thereby its possible to inspect the metadata prior to the actual data download. Also this\n+allows to use the tool for querying data sets (and their accessions) of interest which\n+can then be downloaded in a second call using the accessions.\n ]]>\n </help>\n \n'

diff -r a3395b1d871b -r ac24fff14f23 macros.xml
--- a/macros.xml Mon Nov 21 11:40:05 2022 +0000
+++ b/macros.xml Fri Dec 02 10:52:48 2022 +0000

[

b'@@ -1,5 +1,5 @@\n <macros>\n- <token name="@TOOL_VERSION@">14.3</token>\n+ <token name="@TOOL_VERSION@">14.4</token>\n <token name="@VERSION_SUFFIX@">0</token>\n <token name="@PROFILE@">21.01</token>\n <token name="@LICENSE@">MIT</token>\n@@ -39,8 +39,10 @@\n <option value="file">Read a list of @WHAT_EXTENDED@s from a dataset</option>\n </param>\n <when value="text">\n- <param name="accession" type="text" label="Enter space separated list of @WHAT@s" help="@HELP@">\n- <yield/>\n+ \n+ <param name="accession" type="text" label="Enter comma separated list of @WHAT@s" help="@HELP@">\n+ <validator type="length" min="1" message="Provide at least one @WHAT@"/>\n </param>\n </when>\n <when value="file">\n@@ -59,18 +61,45 @@\n </sanitizer>\n </param>\n </xml>\n- <xml name="include">\n- <param argument="--include" type="select" multiple="true" optional="true">\n- <option value="genome" selected="true">genomic sequence (genome)</option>\n- <option value="rna">transcript (rna)</option>\n- <option value="protein">amnio acid sequences (protein)</option>\n- <option value="cds">nucleotide coding sequences (cds)</option>\n- <option value="gff3">general feature file (gff3)</option>\n- <option value="gtf">gene transfer format (gtf)</option>\n- <option value="gbff">GenBank flat file (gbff)</option>\n- <option value="seq-report">sequence report file (seq-report)</option>\n+\n+ <xml name="taxon_positional">\n+ <param name="taxon_positional" type="text" label="Enter taxon" help="e.g. human, mouse, bos taurus, etc."/>\n+ </xml>\n+\n+ <xml name="ortholog">\n+ <param argument="--ortholog" type="text" label="Retrieve orthologs for taxa" help="Retrieve data for an ortholog set. Provide one or more comma separated taxa (any rank) to filter results or \'all\' for the complete set.">\n+ <sanitizer invalid_char="">\n+ <valid initial="string.letters,string.digits">\n+ <add value=" " />\n+ <add value="," />\n+ <add value="-" />\n+ </valid>\n+ </sanitizer>\n </param>\n </xml>\n+\n+ <xml name="include">\n+ <param argument="--include" type="select" multiple="true" optional="true" label="Include" help="Download the following datasets (if available)">\n+ <yield/>\n+ </param>\n+ </xml>\n+ <xml name="genome_includes">\n+ <option value="genome" selected="true">genomic sequence (genome)</option>\n+ <option value="rna">transcript (rna)</option>\n+ <option value="protein">amnio acid sequences (protein)</option>\n+ <option value="cds">nucleotide coding sequences (cds)</option>\n+ <option value="gff3">general feature file (gff3)</option>\n+ <option value="gtf">gene transfer format (gtf)</option>\n+ <option value="gbff">GenBank flat file (gbff)</option>\n+ <option value="seq-report">sequence report file (seq-report)</option>\n+ <yield/>\n+ </xml>\n+ <xml name="gene_includes">\n+ <option value="gene">gene sequence (gene)</option>\n+ <option value="protein" selected="true">amnio acid sequences (protein)</option>\n+ <yield/>\n+ </xml>\n+\n <token name="@INCLUDE@"><![CDATA[\n --include\n #if $file_choices.include\n@@ -79,6 +108,187 @@\n none\n #end if\n ]]></token>\n+ <xml name="tsv_report_columns">\n+ <param name="report_columns" type="select" multiple="true" optional="false" label="Columns in the report">\n+ <option value="accession">Assembly Accession</option>\n+ '..b' <option value="genomic-region-gene-range-range-order">Genomic Region Gene Range Order</option>\n+ <option value="genomic-region-gene-range-range-orientation">Genomic Region Gene Range Orientation</option>\n+ <option value="genomic-region-gene-range-range-start">Genomic Region Gene Range Start</option>\n+ <option value="genomic-region-gene-range-range-stop">Genomic Region Gene Range Stop</option>\n+ <option value="genomic-region-genomic-region-type">Genomic Region Genomic Region Type</option>\n+ <option value="group-id">Gene Group Identifier</option>\n+ <option value="group-method">Gene Group Method</option>\n+ <option value="name-authority">Nomenclature Authority</option>\n+ <option value="name-id">Nomenclature ID</option>\n+ <option value="omim-ids">OMIM IDs</option>\n+ <option value="orientation">Orientation</option>\n+ <option value="ref-standard-gene-range-accession">Reference Standard Gene Range Sequence Accession</option>\n+ <option value="ref-standard-gene-range-range-order">Reference Standard Gene Range Order</option>\n+ <option value="ref-standard-gene-range-range-orientation">Reference Standard Gene Range Orientation</option>\n+ <option value="ref-standard-gene-range-range-start">Reference Standard Gene Range Start</option>\n+ <option value="ref-standard-gene-range-range-stop">Reference Standard Gene Range Stop</option>\n+ <option value="ref-standard-genomic-region-type">Reference Standard Genomic Region Type</option>\n+ <option value="replaced-gene-id">Replaced NCBI GeneID</option>\n+ <option value="rna-type">RNA Type</option>\n+ <option value="swissprot-accessions">SwissProt Accessions</option>\n+ <option value="symbol">Symbol</option>\n+ <option value="synonyms">Synonyms</option>\n+ <option value="tax-id">Taxonomic ID</option>\n+ <option value="tax-name">Taxonomic Name</option>\n+ <yield/>\n+ </param>\n+ </xml>\n+ <xml name="prok_gene_tsv_report_columns">\n+ <param name="report_columns" type="select" multiple="true" optional="false" label="Columns in the report">\n+ <option value="accession">Accession</option>\n+ <option value="description">Description</option>\n+ <option value="ec-number">EC Number</option>\n+ <option value="gene-symbol">Gene Symbol</option>\n+ <option value="mapping-count">Number of Genome Mappings</option>\n+ <option value="name-evidence-accession">Protein Name EvidenceAccession</option>\n+ <option value="name-evidence-category">Protein Name EvidenceCategory</option>\n+ <option value="name-evidence-source">Protein Name EvidenceSource</option>\n+ <option value="protein-length">Protein Length</option>\n+ <option value="protein-name">Protein Name</option>\n+ <yield/>\n+ </param>\n+ </xml>\n <xml name="released_options" token_released_what="genomes" token_before_or_after="before">\n <param argument="--released-@BEFORE_OR_AFTER@" type="text" optional="true" label="Only include @RELEASED_WHAT@ that have been released @BEFORE_OR_AFTER@ a specified date (MM/DD/YYYY)">\n <validator type="regex" message="enter a date in the form MM/DD/YYYY">[0-9]{2}/[0-9]{2}/[0-9]{4}</validator>\n@@ -93,9 +303,9 @@\n #end if\n </token>\n \n- <xml name="genome_fasta_assert" tokens="el1,el2,expression" token_expression_n="1">\n+ <xml name="genome_fasta_assert" tokens="el1,el2,expression" token_ftype="fasta" token_expression_n="1">\n <element name="@EL1@">\n- <element name="@EL2@">\n+ <element name="@EL2@" ftype="@FTYPE@" decompress="true">\n <assert_contents>\n <has_text_matching expression="@EXPRESSION@" n="@EXPRESSION_N@"/>\n </assert_contents>\n'

diff -r a3395b1d871b -r ac24fff14f23 test-data/geneids.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/geneids.txt Fri Dec 02 10:52:48 2022 +0000

@@ -0,0 +1,2 @@
+2597
+14433