Repository revision
16:a6a475ed58cb

Repository 'ncbi_datasets'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/ncbi_datasets

NCBI Datasets Genomes tool metadata
Miscellaneous
download genome sequence, annotation and metadata
datasets_download_genome
toolshed.g2.bx.psu.edu/repos/iuc/ncbi_datasets/datasets_download_genome/16.20.0+galaxy0
16.20.0+galaxy0
None
True
Version lineage of this tool (guids ordered most recent to oldest)
toolshed.g2.bx.psu.edu/repos/iuc/ncbi_datasets/datasets_download_genome/16.20.0+galaxy0 (this tool)
toolshed.g2.bx.psu.edu/repos/iuc/ncbi_datasets/datasets_download_genome/12.27.1
datasets_download_genome
Requirements (dependencies defined in the <requirements> tag set)
name version type
ncbi-datasets-cli 16.20.0 package
ca-certificates 2024.2.2 package
p7zip 16.02 package
Additional information about this tool
#import re

## If running in a container use certificates from ca-certificates instead of outdated/missing container certificates
[ -f /usr/local/ssl/cacert.pem ] && export SSL_CERT_FILE="/usr/local/ssl/cacert.pem";
        
## Build the "datasets download genome" call; the value of download_by is emitted as the subcommand
datasets download genome $query.subcommand.download_by
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        ## split the accession text on spaces and commas, drop empty items and single-quote each remaining one
        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon_positional'
    $query.subcommand.tax_exact_match
#end if
## filter options: values that evaluate to false are left out, the remaining ones are emitted as given
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
#if $filters.mag:
    --mag '$filters.mag'
#end if


        ## comma separated list of file types to include, or the literal "none" when nothing is selected
        --include
        #if $file_choices.include
            #echo ",".join($file_choices.include)
        #else
            none
        #end if
    
#if $filters.released_before:
--released-before '$filters.released_before'
#end if
    
#if $filters.released_after:
--released-after '$filters.released_after'
#end if
    
## emit one --search flag per entry of filters.search
## (the value has to be read from the loop variable; reading it from $filters yields the same lookup on every iteration)
## NOTE(review): assumes each entry exposes its text as search_term - confirm against the tool's input definitions
#for $term in $filters.search:
    --search '$term.search_term'
#end for
--no-progressbar
--dehydrated

## produce the TSV report file from the downloaded package
## the selected report columns are joined into one comma separated value for --fields
#set $report_fields = ",".join($file_choices.report_columns)
&& dataformat tsv genome
    --package ncbi_dataset.zip
    --fields $report_fields
    > genome_data_report.tsv

## unzip and rehydrate if any data is to be downloaded (include is not None)
## everything up to the matching "end if" is skipped when file_choices.include is empty
#if $file_choices.include
    ## unzip the downloaded package; the 7z output is redirected to 7z.log
    && 7z x -y ncbi_dataset.zip > 7z.log

    ## rehydrate in the current directory
    ## --gzip is only passed when decompress is not set
    ## the number of workers is taken from the environment variable NCBI_DATASETS_MAX_WORKERS, falling back to 10
    && datasets rehydrate
        --directory ./
        #if not $file_choices.decompress
            --gzip
        #end if
        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
    ## the sed expression is anchored at the end of the name and keeps an optional .gz suffix
    ## NOTE(review): the file name is substituted directly into the sh -c string, so names containing
    ##      whitespace or shell metacharacters would presumably break the mv call - confirm this cannot happen
    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

    ## handle the remaining compressed (non-fasta) files
    ## jsonl files are just named .gz, so they are only renamed (the .gz suffix is stripped, no decompression)
    ## note: when decompress is not set, datasets rehydrate is called with --gzip (see above)
    ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
    ##      i.e. every remaining .gz file except those ending in fasta.gz
    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
    #if not $file_choices.decompress
        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
    #end if

    ## if the sequence report was requested, convert each sequence_report.jsonl into a
    ## sequence_report.tsv in the same directory using dataformat tsv genome-seq
    #if "seq-report" in $file_choices.include
        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
    #end if
    
    && true  ## because Galaxy removes trailing ; from command
#end if
None
False
Functional tests
name inputs outputs required files
Test-1 query|subcommand|taxon_positional: human
query|subcommand|download_by: taxon
filters|chromosomes: 21
filters|released_before: 01/01/2018
file_choices|include: ['rna', 'gff3']
name: value
value
Test-2 query|subcommand|taxon_positional: human
query|subcommand|download_by: taxon
filters|assembly_level: ['chromosome', 'complete']
filters|chromosomes: 21
filters|released_before: 01/01/2018
file_choices|include: genome
file_choices|decompress: True
name: value
value
Test-3 query|subcommand|taxon_positional: human
query|subcommand|download_by: taxon
filters|assembly_level: ['chromosome', 'complete']
filters|assembly_source: refseq
filters|chromosomes: 21
filters|released_before: 01/01/2018
file_choices|include: genome
file_choices|decompress: True
Test-4 query|subcommand|text_or_file|accession: GCF_000013305.1 GCF_000007445.1
query|subcommand|text_or_file|text_or_file: text
query|subcommand|download_by: accession
filters|released_before: 01/01/2007
file_choices|include: ['seq-report', 'gtf', 'cds']
file_choices|decompress: True
name: value
value
Test-5 query|subcommand|text_or_file|inputfile: accessions.txt
query|subcommand|text_or_file|text_or_file: file
query|subcommand|download_by: accession
filters|released_before: 01/01/2007
file_choices|include: ['seq-report', 'gff3', 'gbff']
file_choices|decompress: True
name: value
accessions.txt
value
Test-6 query|subcommand|text_or_file|accession: GCF_000001405
query|subcommand|text_or_file|text_or_file: text
query|subcommand|download_by: accession
filters|assembly_version: all
filters|released_before: 01/01/2015
file_choices|include: seq-report
name: value
value
Test-7 query|subcommand|text_or_file|accession: GCF_000146045.2
query|subcommand|text_or_file|text_or_file: text
query|subcommand|download_by: accession
file_choices|include: ['genome', 'protein', 'rna', 'cds']
file_choices|decompress: True
Test-8 query|subcommand|text_or_file|accession: GCF_000146045.2
query|subcommand|text_or_file|text_or_file: text
query|subcommand|download_by: accession
file_choices|include: ['genome', 'protein', 'rna', 'cds']
Test-9 query|subcommand|text_or_file|accession: GCF_000146045.2 GCF_000002945.1
query|subcommand|text_or_file|text_or_file: text
query|subcommand|download_by: accession
file_choices|include: ['seq-report', 'genome']
file_choices|decompress: True
Test-10 query|subcommand|taxon_positional: 4932
query|subcommand|tax_exact_match: True
query|subcommand|download_by: taxon
file_choices|include:
name: value
value