Mercurial > repos > iuc > ncbi_acc_download
diff ncbi_acc_download.xml @ 0:1c58de56d587 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_acc_download commit 6747338e8e02cb87c4f3b9cdea0b761f236a02d1"
author | iuc |
---|---|
date | Wed, 04 Dec 2019 07:01:37 -0500 |
parents | |
children | e063168e0a81 |
line wrap: on
line diff
<tool id="ncbi_acc_download" name="NCBI Accession Download" version="@TOOL_VERSION@+galaxy0">
    <description>Download sequences from GenBank/RefSeq by accession through the NCBI ENTREZ API</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">ncbi-acc-download</requirement>
    </requirements>
    <!--
        Command outline:
          1. Write the requested accessions, one per line, to ./accessions.
          2. Download each accession into ./outdir with ncbi-acc-download,
             pausing two seconds after every request.
          3. Record progress in error.log and, when failures are tolerated,
             the failed accessions in failed.txt.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Build ./accessions from the selected source, dropping empty and whitespace-only lines.
        ## grep exits non-zero when nothing is left, which aborts the job.
        #if $query_source.select == "accession_file":
            { grep -v "^[[:space:]]*$" '$query_source.accession_file' > accessions ||
              { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } &&
        #else if $query_source.select == "accession_list":
            ## Commas and __cn__ markers in the text box value both act as separators.
            echo '$query_source.accession_list' | sed -r 's/(\,|__cn__)/\n/g' | grep -v "^[[:space:]]*$" > accessions &&
        #end if
        mkdir outdir &&
        cd outdir &&
        ignore_errors=$ignore_failed &&
        while read -r accession; do
            echo "Downloading accession number: " \$accession " ..." >> ../error.log &&
            ncbi-acc-download
                --molecule '${molecule.select}'
                --format '${molecule.format}'
                #if $molecule.format != 'featuretable' and $molecule.format != 'gff3':
                    --extended-validation all
                #end if
                ## The accession stays unquoted, as before, so the shell still word-splits it.
                \${accession};
            failure=\$?;
            if [ \$failure -ne 0 ]; then
                echo " failed." >> ../error.log;
                if [ \$ignore_errors -ne 0 ]; then
                    echo \$accession >> ../failed.txt;
                else
                    exit 1;
                fi;
            else
                echo " done." >> ../error.log;
            fi;
            sleep 2;
        done < ../accessions 2> >(tee -a ../error.log >&2);
    ]]></command>
    <inputs>
        <!-- Where the accession numbers come from: a dataset or the text box. -->
        <conditional name="query_source">
            <param name="select" type="select" label="Select source for IDs">
                <option value="accession_file">File containing Accessions (one per line)</option>
                <option value="accession_list">Direct Entry</option>
            </param>
            <when value="accession_file">
                <param label="Accession File" name="accession_file" type="data" format="txt,tabular"/>
            </when>
            <when value="accession_list">
                <param label="ID List" name="accession_list" type="text" area="true" help="Newline/Comma separated list of IDs">
                    <validator type="expression" message="ID list cannot be empty">value.strip()</validator>
                </param>
            </when>
        </conditional>
        <!-- Molecule type decides which download formats are offered. -->
        <conditional name="molecule">
            <param name="select" type="select" label="Molecule Type">
                <option value="nucleotide" selected="true">Nucleotide</option>
                <option value="protein">Protein</option>
            </param>
            <when value="nucleotide">
                <param name="format" type="select" label="File Format">
                    <option value="fasta" selected="true">FASTA</option>
                    <option value="genbank">GenBank</option>
                    <option value="featuretable">Feature Table</option>
                    <option value="gff3">GFF3</option>
                </param>
            </when>
            <when value="protein">
                <param name="format" type="select" label="File Format">
                    <option value="fasta" selected="true">FASTA</option>
                </param>
            </when>
        </conditional>
        <!-- The value is copied into the shell variable ignore_errors: 0 aborts, 1 continues. -->
        <param name="ignore_failed" type="select" display="radio"
               label="How to handle download failures">
            <option value="0">Abort with error on first failure</option>
            <option value="1">Add accession to failed list and continue</option>
        </param>
    </inputs>
    <outputs>
        <!-- Files in outdir are collected by extension; the part before the extension names the element. -->
        <collection name="output" type="list" label="${tool.name} on ${on_string}: Downloaded Files">
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.fa$" directory="outdir" format="fasta"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gbk$" directory="outdir" format="genbank"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gff$" directory="outdir" format="gff"/>
            <discover_datasets pattern="(?P&lt;name&gt;.+)\.ft$" directory="outdir" format="txt"/>
        </collection>
        <data name="error_log" from_work_dir="error.log" label="${tool.name} on ${on_string}: Log" format="txt"/>
        <!-- Only produced when the user chose to continue after failed downloads. -->
        <data name="failed_accessions" from_work_dir="failed.txt" label="${tool.name} on ${on_string}: Failed accessions" format="txt">
            <filter>str(ignore_failed)=='1'</filter>
        </data>
    </outputs>
    <tests>
        <!-- Accession file, nucleotide FASTA -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, GenBank -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="genbank"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="genbank">
                    <assert_contents>
                        <has_line line="DEFINITION Escherichia coli str. Sanji plasmid pSJ_94, complete sequence." />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="genbank">
                    <assert_contents>
                        <has_line line="DEFINITION Escherichia coli strain AR_0162 plasmid tig00002623, complete" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, GFF3 -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="gff3"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="gff">
                    <assert_contents>
                        <has_line line="##sequence-region CP011064.1 1 94712" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="gff">
                    <assert_contents>
                        <has_line line="##sequence-region CP021680.1 1 23332" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Accession file, feature table -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="featuretable"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_file" />
                <param name="accession_file" value="accessions_1.tsv"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="txt">
                    <assert_contents>
                        <has_line line=">Feature gb|CP011064.1|" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="txt">
                    <assert_contents>
                        <has_line line=">Feature gb|CP021680.1|" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Direct entry, comma separated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP021680"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- One invalid accession among valid ones, failures tolerated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
            </conditional>
            <param name="ignore_failed" value="1" />
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
            <output name="failed_accessions">
                <assert_contents>
                    <has_line line="CP0XXXXX" />
                </assert_contents>
            </output>
        </test>
        <!-- Only an invalid accession, failures tolerated -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP0XXXXX"/>
            </conditional>
            <param name="ignore_failed" value="1" />
            <output name="failed_accessions">
                <assert_contents>
                    <has_line line="CP0XXXXX" />
                </assert_contents>
            </output>
        </test>
        <!-- Invalid accession with abort-on-failure: the job must fail -->
        <test expect_failure="true">
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
            </conditional>
            <param name="ignore_failed" value="0" />
        </test>
        <!-- Direct entry without a comma between the accessions -->
        <test>
            <conditional name="molecule">
                <param name="select" value="nucleotide"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="CP011064 CP021680"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="CP011064" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
                    </assert_contents>
                </element>
                <element name="CP021680" ftype="fasta">
                    <assert_contents>
                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- Protein FASTA -->
        <test>
            <conditional name="molecule">
                <param name="select" value="protein"/>
                <param name="format" value="fasta"/>
            </conditional>
            <conditional name="query_source">
                <param name="select" value="accession_list" />
                <param name="accession_list" value="NP_003192"/>
            </conditional>
            <output_collection name="output" type="list">
                <element name="NP_003192" ftype="fasta">
                    <assert_contents>
                        <has_line line=">NP_003192.1 transcription factor A, mitochondrial isoform 1 precursor [Homo sapiens]" />
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

Given a file containing a list of NCBI accession numbers or a direct entry of accession numbers in the tool text input box, this tool will download the corresponding sequence records via the NCBI API.

**Limitations**

- For protein sequence downloads, only fasta format is supported
- To avoid rate-limits imposed by the NCBI API, records are downloaded sequentially with a delay between requests. This may make it impractical to use this tool to download many (>100) records.

**Output**

A collection of sequence records in the desired format.
    ]]></help>
    <citations>
    </citations>
</tool>