diff ncbi_acc_download.xml @ 0:1c58de56d587 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_acc_download commit 6747338e8e02cb87c4f3b9cdea0b761f236a02d1"
author iuc
date Wed, 04 Dec 2019 07:01:37 -0500
parents
children e063168e0a81
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ncbi_acc_download.xml	Wed Dec 04 07:01:37 2019 -0500
@@ -0,0 +1,313 @@
+<tool id="ncbi_acc_download" name="NCBI Accession Download" version="@TOOL_VERSION@+galaxy0">
+    <description>Download sequences from GenBank/RefSeq by accession through the NCBI ENTREZ API</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ncbi-acc-download</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #if $query_source.select == "accession_file":
+            { grep -v "^[ \t]*$" $query_source.accession_file > accessions ||
+            { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } &&
+        #else if $query_source.select == "accession_list":
+            echo '$query_source.accession_list' | sed -r 's/(\,|__cn__)/\n/g' | grep -v "^[ \t]*$" > accessions &&
+        #end if
+        mkdir outdir &&
+        cd outdir &&
+        ignore_errors=$ignore_failed &&
+        while read accession; do
+        echo "Downloading accession number: " \$accession " ..." >> ../error.log &&
+        ncbi-acc-download
+            --molecule '${molecule.select}'
+            --format '${molecule.format}'
+            #if $molecule.format != 'featuretable' and $molecule.format != 'gff3':
+                --extended-validation all
+            #end if
+            \${accession};
+        failure=\$?;
+        if [ \$failure -ne 0 ]; then
+            echo " failed." >> ../error.log;
+            if [ \$ignore_errors -ne 0 ]; then
+                echo \$accession >> ../failed.txt;
+            else
+                exit 1;
+            fi;
+        else
+            echo " done." >> ../error.log;
+        fi;
+        sleep 2;
+        done < ../accessions 2> >(tee -a ../error.log >&2);
+    ]]></command>
+    <inputs>
+        <conditional name="query_source">
+            <param name="select" type="select" label="Select source for IDs">
+                <option value="accession_file">File containing Accessions (one per line)</option>
+                <option value="accession_list">Direct Entry</option>
+            </param>
+            <when value="accession_file">
+                <param label="Accession File" name="accession_file" type="data" format="txt,tabular"/>
+            </when>
+            <when value="accession_list">
+                <param label="ID List" name="accession_list" type="text" area="true" help="Newline/Comma separated list of IDs">
+                    <validator type="expression" message="ID list cannot be empty">value.strip()</validator>
+                </param>
+            </when>
+        </conditional>
+        <conditional name="molecule">
+            <param name="select" type="select" label="Molecule Type">
+                <option value="nucleotide" selected="true">Nucleotide</option>
+                <option value="protein">Protein</option>
+            </param>
+            <when value="nucleotide">
+                <param name="format" type="select" label="File Format">
+                    <option value="fasta" selected="true">FASTA</option>
+                    <option value="genbank">GenBank</option>
+                    <option value="featuretable">Feature Table</option>
+                    <option value="gff3">GFF3</option>
+                </param>
+            </when>
+            <when value="protein">
+                <param name="format" type="select" label="File Format">
+                    <option value="fasta" selected="true">FASTA</option>
+                </param>
+            </when>
+        </conditional>
+        <param name="ignore_failed" type="select" display="radio"
+        label="How to handle download failures">
+            <option value="0">Abort with error on first failure</option>
+            <option value="1">Add accession to failed list and continue</option>
+        </param>
+    </inputs>
+    <outputs>
+        <collection name="output" type="list" label="${tool.name} on ${on_string}: Downloaded Files">
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.fa$" directory="outdir" format="fasta"/>
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gbk$" directory="outdir" format="genbank"/>
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.gff$" directory="outdir" format="gff"/>
+            <discover_datasets pattern="(?P&lt;name&gt;.+)\.ft$" directory="outdir" format="txt"/>
+        </collection>
+        <data name="error_log" from_work_dir="error.log" label="${tool.name} on ${on_string}: Log" format="txt"/>
+        <data name="failed_accessions" from_work_dir="failed.txt" label="${tool.name} on ${on_string}: Failed accessions" format="txt">
+            <filter>str(ignore_failed)=='1'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_file" />
+                <param name="accession_file" value="accessions_1.tsv"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="genbank"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_file" />
+                <param name="accession_file" value="accessions_1.tsv"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="genbank">
+                    <assert_contents>
+                        <has_line line="DEFINITION  Escherichia coli str. Sanji plasmid pSJ_94, complete sequence." />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="genbank">
+                    <assert_contents>
+                        <has_line line="DEFINITION  Escherichia coli strain AR_0162 plasmid tig00002623, complete" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="gff3"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_file" />
+                <param name="accession_file" value="accessions_1.tsv"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="gff">
+                    <assert_contents>
+                        <has_line line="##sequence-region CP011064.1 1 94712" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="gff">
+                    <assert_contents>
+                        <has_line line="##sequence-region CP021680.1 1 23332" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="featuretable"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_file" />
+                <param name="accession_file" value="accessions_1.tsv"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="txt">
+                    <assert_contents>
+                        <has_line line=">Feature gb|CP011064.1|" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="txt">
+                    <assert_contents>
+                        <has_line line=">Feature gb|CP021680.1|" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="CP011064,CP021680"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
+            </conditional>
+            <param name="ignore_failed" value="1" />
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output name="failed_accessions">
+                <assert_contents>
+                    <has_line line="CP0XXXXX" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="CP0XXXXX"/>
+            </conditional>
+            <param name="ignore_failed" value="1" />
+            <output name="failed_accessions">
+                <assert_contents>
+                    <has_line line="CP0XXXXX" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_failure="true">
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
+            </conditional>
+            <param name="ignore_failed" value="0" />
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="nucleotide"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="CP011064&#10;CP021680"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="CP011064" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
+                    </assert_contents>
+                </element>
+                <element name="CP021680" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <conditional name="molecule">
+                <param name="select" value="protein"/>
+                <param name="format" value="fasta"/>
+            </conditional>
+            <conditional name="query_source">
+                <param name="select" value="accession_list" />
+                <param name="accession_list" value="NP_003192"/>
+            </conditional>
+            <output_collection name="output" type="list">
+                <element name="NP_003192" ftype="fasta">
+                    <assert_contents>
+                        <has_line line=">NP_003192.1 transcription factor A, mitochondrial isoform 1 precursor [Homo sapiens]" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+Given a file containing a list of NCBI accession numbers or a direct entry of accession numbers in the tool text input box, this tool will download the corresponding sequence records via the NCBI API. 
+
+**Limitations**
+- For protein sequence downloads, only fasta format is supported
+- To avoid rate-limits imposed by the NCBI API, records are downloaded sequentially with a delay between requests. This may make it impractical to use this tool to download many (>100) records.
+
+**Output**
+A collection of sequence records in the desired format.
+    ]]></help>
+    <citations>
+    </citations>
+</tool>