Mercurial > repos > iuc > ncbi_datasets

<tool id="datasets_download_genome" name="NCBI Datasets Genomes" profile="@PROFILE@" license="@LICENSE@" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description>download genome sequence, annotation and metadata</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"></expand>
    <expand macro="version_command"/>
    <command><![CDATA[
#import re
@SETUP_CERTIFICATES@
datasets download genome $query.subcommand.download_by
#if $query.subcommand.download_by == 'accession':
    #if $query.subcommand.text_or_file.text_or_file == 'text':
        #echo " ".join(f"'{x}'" for x in re.split(" |,", str($query.subcommand.text_or_file.accession)) if x)
    #else
        --inputfile '$query.subcommand.text_or_file.inputfile'
    #end if
#else:
    '$query.subcommand.taxon_positional'
    $query.subcommand.tax_exact_match
#end if
$filters.reference
$filters.annotated
#if $filters.assembly_level:
    --assembly-level $filters.assembly_level
#end if
--assembly-version $filters.assembly_version
#if $filters.assembly_source:
    --assembly-source $filters.assembly_source
#end if
#if $filters.chromosomes:
    --chromosomes '$filters.chromosomes'
#end if
$filters.exclude_atypical
#if $filters.mag:
    --mag '$filters.mag'
#end if

@INCLUDE@
@RELEASED_BEFORE@
@RELEASED_AFTER@
#for search_term in $filters.search:
    --search '$filters.search_term'
#end for
--no-progressbar
--dehydrated

## produce TSV report file
&& dataformat tsv genome
    --package ncbi_dataset.zip
    --fields #echo ",".join($file_choices.report_columns)
    > genome_data_report.tsv

## unzip and rehydrate if any data is to be downloaded (include is not None)
#if $file_choices.include
    ## unzip
    && 7z x -y ncbi_dataset.zip > 7z.log

    ## rehydrate
    && datasets rehydrate
        --directory ./
        #if not $file_choices.decompress
            --gzip
        #end if
        --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

    ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
    && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

    ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
    ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip)
    ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
    && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
    #if not $file_choices.decompress
        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
    #end if

    #if "seq-report" in $file_choices.include
        && find ncbi_dataset -name sequence_report.jsonl -exec sh -c 'dataformat tsv genome-seq --inputfile {} > \$(dirname {})/\$(basename {} .jsonl).tsv' \;
    #end if

    && true  ## because Galaxy removes trailing ; from command
#end if
]]></command>
    <inputs>
        <section name="query" title="Query" expanded="true">
            <conditional name="subcommand">
                <param name="download_by" type="select" label="Choose how to find genomes to download">
                    <option value="accession">By NCBI assembly or BioProject accession</option>
                    <option value="taxon">By taxon (NCBI Taxonomy ID, scientific or common name at any tax rank)</option>
                </param>
                <when value="accession">
                    <expand macro="text_or_file"/>
                </when>
                <when value="taxon">
                    <expand macro="taxon_positional"/>
                    <param argument="--tax-exact-match" type="boolean" truevalue="--tax-exact-match" falsevalue="" label="Exclude sub-species when a species-level taxon is specified"/>
                </when>
            </conditional>
        </section>
        <section name="filters" title="Filters and Limit">
            <param argument="--reference" type="boolean" truevalue="--reference" falsevalue="" label="Limit to reference and representative (GCF_ and GCA_) assemblies"/>
            <expand macro="annotation"/>
            <expand macro="assembly_level"/>
            <param argument="--assembly-version" type="select" label="Assembly version(s)">
                <option value="latest">Latest</option>
                <option value="all">All</option>
            </param>
            <expand macro="assembly_source"/>
            <expand macro="chromosomes"/>
            <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
            <param argument="--mag" type="select" multiple="false" optional="true" label="Filter metagenome assembled genomes (MAGs)">
                <option value="only" selected="false">Limit to MAGs</option>
                <option value="exclude" selected="false">Exclude MAGs</option>
            </param>
            <expand macro="released_options"/>
            <expand macro="released_options" before_or_after="after"/>

            <repeat name="search" title="Add search terms">
                <param argument="--search" type="text" label="Only include genomes that have the specified text in the searchable fields" help="Searchable fields are species and infraspecies, assembly name and submitter"/>
            </repeat>
        </section>
        <section name="file_choices" title="Output options" expanded="true">
            <expand macro="tsv_report_columns">
                <option value="accession" selected="true">accession</option>
                <option value="organism-name" selected="true">organism-name</option>
                <option value="assminfo-submitter" selected="true">assminfo-submitter</option>
                <option value="assminfo-name" selected="true">assminfo-name</option>
            </expand>
            <expand macro="include">
                <expand macro="genome_includes"/>
            </expand>
            <param name="decompress" type="boolean" label="Decompress FASTA" help="By default FASTA files are provided zipped (fasta.gz) if this is checked the data will be decompressed"/>
        </section>
    </inputs>
    <outputs>
        <data name="genome_data_report" format="tabular" label="NCBI Genome Datasets: Data Report" from_work_dir="genome_data_report.tsv"/>
        <collection name="sequence_report" label="NCBI Genome Datasets: Sequence Data Report" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/sequence_report.tsv" ext="tabular" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "seq-report" in file_choices['include']</filter>
        </collection>
        <collection name="genome_fasta" label="NCBI Genome Datasets: genome fasta" type="list:list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)/(?!rna|cds_from)(?P&lt;identifier_1&gt;.*?)(_genomic)?\.(?P&lt;ext&gt;fasta(\.gz)?)"  directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "genome" in file_choices['include']</filter>
        </collection>
        <collection name="rna_fasta" label="NCBI Genome Datasets: RNA fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/rna\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "rna" in file_choices['include']</filter>
        </collection>
        <collection name="protein_fasta" label="NCBI Genome Datasets: protein fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/protein\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "protein" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_cds" label="NCBI Genome Datasets: genomic cds fasta" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/cds_from_genomic\.(?P&lt;ext&gt;fasta(\.gz)?)$" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "cds" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gff" label="NCBI Genome Datasets: genomic gff3" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gff" ext="gff3" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gff3" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gtf" label="NCBI Genome Datasets: gtf" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gtf" ext="gtf" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gtf" in file_choices['include']</filter>
        </collection>
        <collection name="genomic_gbff" label="NCBI Genome Datasets: GenBank flatfile" type="list">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;.*?)\/genomic\.gbff" ext="txt" directory="ncbi_dataset/data" recurse="true" match_relative_path="true"/>
            <filter>file_choices['include'] and "gbff" in file_choices['include']</filter>
        </collection>
    </outputs>
    <tests>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <!-- include a sequence (which should be downloaded as fasta.gz)
                     and one non-sequence (which should be decompressed) output -->
                <param name="include" value="rna,gff3"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Assembly Accession&#009;Assembly Name&#009;Assembly Submitter&#009;Organism Name"/>
                    <has_n_lines n="142"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000306695.2" decompress="true">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000306695.2">
                    <assert_contents>
                        <has_n_lines min="1000000"/>
                        <has_line line="##gff-version 3"/>
                        <has_n_columns n="9" comment="#"/>
                    </assert_contents>
                </element>
            </output_collection>
            <assert_command>
                <has_text text="gunzip"/>
            </assert_command>
        </test>
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="12">
                <expand macro="genome_fasta_assert" el1="GCA_000002115.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000212995.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000252825.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000306695.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_000365445.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001292825.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001524155.4" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_001712695.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCA_022833125.2" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
                <!-- According to  https://github.com/ncbi/datasets/issues/188, the following should not be included among the returned results anymore 09/2023 -->
                <!--
                <expand macro="genome_fasta_assert" el1="GCA_000442335.2" el2="GCA_000442335.2_LinearCen1.1_normalized" expression=">" expression_n="25"/>
                <expand macro="genome_fasta_assert" el1="GCA_000002135.3" el2="GCA_000002135.3_CRA_TCAGchr7v2" expression=">"/>
                -->
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
        </test>
        <!-- same as previous test but assembly_source=refseq, which removes all of the genomes -->
        <test expect_failure="true">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="human"/>
            </conditional>
            <param name="chromosomes" value="21"/>
            <param name="assembly_level" value="chromosome,complete"/>
            <param name="assembly_source" value="refseq"/>
            <param name="released_before" value="01/01/2018"/>
            <section name="file_choices">
                <param name="include" value="genome"/>
                <param name="decompress" value="true"/>
            </section>
            <assert_stderr>
                <has_text text="no genome assemblies were found"/>
            </assert_stderr>
            <!-- In the current state of the NCBI tool/DB, no output to check.
             But the returned results seem to change from time to time and it might
             be necessary to re-enable this code block if the test fails in the future. -->
            <!--
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002125.1" el2="chr21" expression=">"/>
                <expand macro="genome_fasta_assert" el1="GCF_000306695.2" el2="chr21" expression=">"/>
            </output_collection>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="Homo sapiens"/>
                    <has_n_lines n="5"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output> -->
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000013305.1 GCF_000007445.1"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2007"/>
            <section name="file_choices">
                <param name="include" value="seq-report,gtf,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="sequence_report" type="list" count="2" >
                <element name="GCF_000007445.1">
                    <assert_contents>
                        <has_text text="GCF_000007445.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
                <element name="GCF_000013305.1">
                    <assert_contents>
                        <has_text text="GCF_000013305.1"/>
                        <has_n_lines n="2"/>
                        <has_n_columns n="15"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="genomic_gtf" type="list">
                <element name="GCF_000007445.1" file="GCF_000007445.1.genomic.gtf" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.gtf" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_cds" type="list">
                <element name="GCF_000007445.1" file="genome.2.GCF_000007445.1.genomic.cds" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.2.GCF_000013305.1.genomic.cds" compare="contains"/>
            </output_collection>
        </test>
        <test expect_num_outputs="4">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="file"/>
                    <param name="inputfile" value="accessions.txt"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2007"/>
            <section name="file_choices">
                <param name="include" value="seq-report,gff3,gbff"/>
                <param name="decompress" value="true"/>
            </section>
            <output name="genome_data_report">
                <assert_contents>
                    <has_text text="GCF_000013305.1"/>
                    <has_text text="GCF_000007445.1"/>
                    <has_n_lines n="3"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <output_collection name="genomic_gff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gff" compare="contains"/>
            </output_collection>
            <output_collection name="genomic_gbff" type="list">
                <element name="GCF_000007445.1" file="genome.3.GCF_000007445.1.genomic.gbff" compare="contains"/>
                <element name="GCF_000013305.1" file="genome.3.GCF_000013305.1.genomic.gbff" compare="contains"/>
            </output_collection>
        </test>

        <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
        <test expect_num_outputs="2">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000001405"/>
                </conditional>
            </conditional>
            <param name="released_before" value="01/01/2015"/>
            <param name="assembly_version" value="all"/>
            <section name="file_choices">
                <param name="include" value="seq-report"/>
            </section>
            <output name="genome_data_report">
                <!-- assert that we get at least the 16 versions available at the time of writing this test -->
                <assert_contents>
                    <has_text text="GCF_000001405" min="16"/>
                    <has_n_lines min="16"/>
                    <has_n_columns n="4"/>
                </assert_contents>
            </output>
            <!--not testing the collection output. the count will change over time
                and this can't be tested for at the moment
                <output_collection name="sequence_report" type="list" count="16"/> -->
        </test>
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" decompress="true">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- same as the previous test, but use the default value for decompress,
            see comment at the beginning of the tests -->
        <test expect_num_outputs="5">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="genome,protein,rna,cds"/>
            </section>
            <output_collection name="genome_fasta" type="list:list" count="1">
                <element name="GCF_000146045.2">
                    <element name="GCF_000146045.2_R64" ftype="fasta.gz">
                        <assert_contents>
                            <has_size value="3843460" delta="2000"/>
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
            <output_collection name="protein_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size value="1845038" delta="2000"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="rna_fasta" type="list" count="1">
                <element name="GCF_000146045.2" ftype="fasta.gz">
                    <assert_contents>
                        <has_size min="2700000" max="2800000"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <test expect_num_outputs="3">
            <conditional name="query|subcommand">
                <param name="download_by" value="accession"/>
                <conditional name="text_or_file">
                    <param name="text_or_file" value="text"/>
                    <param name="accession" value="GCF_000146045.2 GCF_000002945.1"/>
                </conditional>
            </conditional>
            <section name="file_choices">
                <param name="include" value="seq-report,genome"/>
                <param name="decompress" value="true"/>
            </section>
            <output_collection name="sequence_report" type="list" count="2"/>
            <output_collection name="genome_fasta" type="list:list" count="2">
                <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
            </output_collection>
        </test>
        <!-- tax_exact_match should filter out strains
             https://github.com/ncbi/datasets/issues/187 -->
        <test expect_num_outputs="1">
            <conditional name="query|subcommand">
                <param name="download_by" value="taxon"/>
                <param name="taxon_positional" value="4932"/>
                <param name="tax_exact_match" value="true"/>
            </conditional>
            <param name="include" value=""/>
            <output name="genome_data_report">
                <assert_contents>
                   <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
<![CDATA[
**Download Genome Datasets from NCBI**

Download a genome dataset including genome, transcript and protein sequence, annotation and a detailed data report.
Genome datasets can be specified by NCBI Assembly or BioProject accession(s) or by taxon.

The download is a three step process:

1. A "dehydrated" zip file is downloaded which includes the metadata and the download URL)
2. The metadata is transformed into a tabular (TSV) file
3. The data is hydrated (the actual data is downloaded)

The 3rd step can be skipped by unselecting all output types in the `Include` parameter.
Thereby its possible to inspect the metadata prior to the actual data download. Also this
allows to use the tool for querying data sets (and their accessions) of interest which
can then be downloaded in a second call using the accessions.
]]>
    </help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Fri, 07 Jun 2024 14:19:58 +0000
parents	198c75abbf55
children