# HG changeset patch # User iuc # Date 1669978368 0 # Node ID ac24fff14f23294772a18662d2ab709a892c2cf6 # Parent a3395b1d871b2c0b3c51ccb08bfe6ee68ebe9176 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 4d7d3a56084e140f4fa63fb0e04a08b732f247f2 diff -r a3395b1d871b -r ac24fff14f23 datasets_gene.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datasets_gene.xml Fri Dec 02 10:52:48 2022 +0000 @@ -0,0 +1,536 @@ + + download gene sequences and metadata + + macros.xml + + + gene_data_report.tsv +## if ! dataformat tsv gene --package ncbi_dataset.zip > gene_data_report.tsv 2> dataformat.log; then +## dataformat tsv prok-gene --package ncbi_dataset.zip > gene_data_report.tsv 2>> dataformat.log; +## fi + +#if $file_choices.kingdom_cond.include and "product-report" in $file_choices.kingdom_cond.include + && dataformat tsv gene-product --package ncbi_dataset.zip > gene_product_report.tsv +#end if + +## unzip and rehydrate if any data is to be downloaded (include is not None) +#if $file_choices.kingdom_cond.include + ## unzip + && 7z x -y ncbi_dataset.zip > 7z.log +#end if +]]> + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + file_choices['kingdom_cond']['include'] and "product-report" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "gene" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "rna" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "protein" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "cds" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include'] + + + file_choices['kingdom_cond']['include'] and "5p-utr" in file_choices['kingdom_cond']['include'] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + + + + + +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + + +
+ + + + + + + + +
+ + + + + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + + + + + + +
+ + + + +
+ + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + +
+ + + +
+
+ + + + +
+ + + + + + + + + + + +
+
+ + + +
diff -r a3395b1d871b -r ac24fff14f23 datasets_genome.xml --- a/datasets_genome.xml Mon Nov 21 11:40:05 2022 +0000 +++ b/datasets_genome.xml Fri Dec 02 10:52:48 2022 +0000 @@ -5,16 +5,17 @@ ncbi_dataset.txt +--dehydrated + +## produce TSV report file +&& dataformat tsv genome + --package ncbi_dataset.zip + --fields #echo ",".join($file_choices.report_columns) + > genome_data_report.tsv + +## unzip and rehydrate if any data is to be downloaded (include is not None) +#if $file_choices.include + ## unzip + && 7z x -y ncbi_dataset.zip > 7z.log + + ## rehydrate + && datasets rehydrate + --directory ./ + #if not $file_choices.decompress + --gzip + #end if + --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10} + + ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery + && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \; + + ## unzip all compressed (non-fasta) files (jsonl files are just named .gz) + && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \; + #if $file_choices.decompress + && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \; + #end if + + #if "seq-report" in $file_choices.include + && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \; + #end if + + && true ## because Galaxy removes trailing ; from command #end if ]]>
- - + + - + @@ -67,7 +99,6 @@ - @@ -78,82 +109,93 @@
-
- +
+ + + + + + + + + +
- - - not uncompressed - - - not uncompressed - - - uncompressed - + - - uncompressed and file_choices['include'] and "seq-report" in file_choices['include'] + + file_choices['include'] and "seq-report" in file_choices['include'] - - uncompressed and file_choices['include'] and "genome" in file_choices['include'] + + file_choices['include'] and "genome" in file_choices['include'] - - uncompressed and file_choices['include'] and "rna" in file_choices['include'] + + file_choices['include'] and "rna" in file_choices['include'] - - uncompressed and file_choices['include'] and "protein" in file_choices['include'] + + file_choices['include'] and "protein" in file_choices['include'] - - uncompressed and file_choices['include'] and "cds" in file_choices['include'] + + file_choices['include'] and "cds" in file_choices['include'] - uncompressed and file_choices['include'] and "gff3" in file_choices['include'] + file_choices['include'] and "gff3" in file_choices['include'] - uncompressed and file_choices['include'] and "gtf" in file_choices['include'] + file_choices['include'] and "gtf" in file_choices['include'] - uncompressed and file_choices['include'] and "gbff" in file_choices['include'] + file_choices['include'] and "gbff" in file_choices['include'] - + + - - + - - - +
+ +
+ - + + +
- - + - - +
+ + +
@@ -174,22 +216,24 @@ +
- + - - + - - +
+ + +
@@ -197,6 +241,8 @@ + +
@@ -208,21 +254,41 @@ - - +
+ + +
+ + + + + + + + + + + + + + + + + + - - + +
@@ -233,12 +299,17 @@ - - +
+ + +
- + + + + @@ -250,7 +321,9 @@
- + + + @@ -258,32 +331,14 @@ - - - - - - - - - - - - - - - - - - - - - - - - +
+ +
+
@@ -293,11 +348,64 @@ - - +
+ + +
+ + + + + + + + + + + + + + +
+ + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + +
@@ -307,8 +415,11 @@ - - +
+ + +
+ @@ -320,12 +431,10 @@ - - + - @@ -338,15 +447,18 @@ **Download Genome Datasets from NCBI** Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report. -Genome datasets can be specified by NCBI Assembly or BioProject accession or taxon. Datasets are downloaded as a zip file. +Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon. + +The download is a three-step process: -Tthe default genome dataset includes the following files (if available): - * data_report.jsonl (genome assembly and annotation metadata, not always available) - * genomic.fna (genomic sequences) - * rna.fna (transcript sequences) - * protein.faa (protein sequences) - * genomic.gff (genome annotation in gff3 format) - * dataset_catalog.json (a list of files and file types included in the dataset) +1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL +2. The metadata is transformed into a tabular (TSV) file +3. The data is hydrated (the actual data is downloaded) + +The 3rd step can be skipped by deselecting all output types in the `Include` parameter. +This makes it possible to inspect the metadata prior to the actual data download. It also +allows the tool to be used for querying data sets (and their accessions) of interest which +can then be downloaded in a second call using the accessions. 
]]> diff -r a3395b1d871b -r ac24fff14f23 macros.xml --- a/macros.xml Mon Nov 21 11:40:05 2022 +0000 +++ b/macros.xml Fri Dec 02 10:52:48 2022 +0000 @@ -1,5 +1,5 @@ - 14.3 + 14.4 0 21.01 MIT @@ -39,8 +39,10 @@ - - + + + @@ -59,18 +61,45 @@ - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [0-9]{2}/[0-9]{2}/[0-9]{4} @@ -93,9 +303,9 @@ #end if - + - + diff -r a3395b1d871b -r ac24fff14f23 test-data/geneids.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/geneids.txt Fri Dec 02 10:52:48 2022 +0000 @@ -0,0 +1,2 @@ +2597 +14433