# HG changeset patch
# User iuc
# Date 1669978368 0
# Node ID ac24fff14f23294772a18662d2ab709a892c2cf6
# Parent a3395b1d871b2c0b3c51ccb08bfe6ee68ebe9176
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2
diff -r a3395b1d871b -r ac24fff14f23 datasets_gene.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datasets_gene.xml Fri Dec 02 10:52:48 2022 +0000
@@ -0,0 +1,536 @@
+
+ download gene sequences and metadata
+
+ macros.xml
+
+
+ gene_data_report.tsv
+## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then
+## dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log;
+## fi
+
+#if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include
+ && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv
+#end if
+
+## unzip and rehydrate if any data is to be downloaded (include is not None)
+#if $file_choices.kingdom_cond.include
+ ## unzip
+ && 7z x -y ncbi_dataset.zip > 7z.log
+#end if
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ file_choices['kingdom_cond']['include'] and "product-report" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "gene" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "rna" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "protein" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']
+
+
+ file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include']
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r a3395b1d871b -r ac24fff14f23 datasets_genome.xml
--- a/datasets_genome.xml Mon Nov 21 11:40:05 2022 +0000
+++ b/datasets_genome.xml Fri Dec 02 10:52:48 2022 +0000
@@ -5,16 +5,17 @@
ncbi_dataset.txt
+--dehydrated
+
+## produce TSV report file
+&& dataformat tsv genome
+ --package ncbi_dataset.zip
+ --fields #echo ",".join($file_choices.report_columns)
+ > genome_data_report.tsv
+
+## unzip and rehydrate if any data is to be downloaded (include is not None)
+#if $file_choices.include
+ ## unzip
+ && 7z x -y ncbi_dataset.zip > 7z.log
+
+ ## rehydrate
+ && datasets rehydrate
+ --directory ./
+ #if not $file_choices.decompress
+ --gzip
+ #end if
+ --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}
+
+ ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
+ && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;
+
+ ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
+ && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
+ #if $file_choices.decompress
+ && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+ #end if
+
+ #if "seq-report" in $file_choices.include
+ && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
+ #end if
+
+ && true ## because Galaxy removes trailing ; from command
#end if
]]>
-
-
+
+
-
+
@@ -67,7 +99,6 @@
-
@@ -78,82 +109,93 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
- not uncompressed
-
-
- not uncompressed
-
-
- uncompressed
-
+
-
- uncompressed and file_choices['include'] and "seq-report" in file_choices['include']
+
+ file_choices['include'] and "seq-report" in file_choices['include']
-
- uncompressed and file_choices['include'] and "genome" in file_choices['include']
+
+ file_choices['include'] and "genome" in file_choices['include']
-
- uncompressed and file_choices['include'] and "rna" in file_choices['include']
+
+ file_choices['include'] and "rna" in file_choices['include']
-
- uncompressed and file_choices['include'] and "protein" in file_choices['include']
+
+ file_choices['include'] and "protein" in file_choices['include']
-
- uncompressed and file_choices['include'] and "cds" in file_choices['include']
+
+ file_choices['include'] and "cds" in file_choices['include']
- uncompressed and file_choices['include'] and "gff3" in file_choices['include']
+ file_choices['include'] and "gff3" in file_choices['include']
- uncompressed and file_choices['include'] and "gtf" in file_choices['include']
+ file_choices['include'] and "gtf" in file_choices['include']
- uncompressed and file_choices['include'] and "gbff" in file_choices['include']
+ file_choices['include'] and "gbff" in file_choices['include']
-
+
+
-
-
+
-
-
-
-
-
+
-
-
+
@@ -174,22 +216,24 @@
+
-
+
-
-
+
-
-
+
@@ -197,6 +241,8 @@
+
+
@@ -208,21 +254,41 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
@@ -233,12 +299,17 @@
-
-
+
-
+
+
+
+
@@ -250,7 +321,9 @@
-
+
+
+
@@ -258,32 +331,14 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
@@ -293,11 +348,64 @@
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -307,8 +415,11 @@
-
-
+
+
@@ -320,12 +431,10 @@
-
-
+
-
@@ -338,15 +447,18 @@
**Download Genome Datasets from NCBI**
Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
-Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file.
+Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.
+
+The download is a three step process:
-Tthe default genome dataset includes the following files (if available):
- * data_report.jsonl (genome assembly and annotation metadata, not always available)
- * genomic.fna (genomic sequences)
- * rna.fna (transcript sequences)
- * protein.faa (protein sequences)
- * genomic.gff (genome annotation in gff3 format)
- * dataset_catalog.json (a list of files and file types included in the dataset)
+1. A "dehydrated" zip file is downloaded (which includes the metadata and the download URL)
+2. The metadata is transformed into a tabular (TSV) file
+3. The data is hydrated (the actual data is downloaded)
+
+The 3rd step can be skipped by deselecting all output types in the `Include` parameter.
+This makes it possible to inspect the metadata prior to the actual data download. It also
+allows the tool to be used for querying data sets (and their accessions) of interest, which
+can then be downloaded in a second call using the accessions.
]]>
diff -r a3395b1d871b -r ac24fff14f23 macros.xml
--- a/macros.xml Mon Nov 21 11:40:05 2022 +0000
+++ b/macros.xml Fri Dec 02 10:52:48 2022 +0000
@@ -1,5 +1,5 @@
- 14.3
+ 14.4
0
21.01
MIT
@@ -39,8 +39,10 @@
-
-
+
+
+
@@ -59,18 +61,45 @@
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
[0-9]{2}/[0-9]{2}/[0-9]{4}
@@ -93,9 +303,9 @@
#end if
-
+
-
+
diff -r a3395b1d871b -r ac24fff14f23 test-data/geneids.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/geneids.txt Fri Dec 02 10:52:48 2022 +0000
@@ -0,0 +1,2 @@
+2597
+14433