changeset 15:f5ea3ce9b9b0 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit fe3f54a0d3edb83fcf6752e3b1524c582b4febd5"
author iuc
date Tue, 10 Sep 2019 11:35:35 -0400
parents 1790dcf3c32d
children aad3885b3216
files fasterq_dump.xml fastq_dump.xml sra_macros.xml sra_pileup.xml test-data/ERR086330_1.fastq.gz test-data/ERR086330_2.fastq.gz test-data/SRR002702_1.fastq.gz test-data/SRR002702_2.fastq.gz test-data/SRR522874.fastq.gz test-data/SRR522874.sra test-data/SRR522874.sra_1.fastq.gz test-data/SRR522874.sra_2.fastq.gz test-data/SRR522874.sra_3.fastq.gz test-data/SRR522874.sra_4.fastq.gz test-data/SRR522874_1.fastq.gz test-data/SRR522874_2.fastq.gz test-data/SRR6982805.fastqsanger.gz test-data/list_sra
diffstat 18 files changed, 261 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasterq_dump.xml	Tue Sep 10 11:35:35 2019 -0400
@@ -0,0 +1,238 @@
+<tool id="fasterq_dump" name="Faster Download and Extract Reads in FASTQ" version="@VERSION@.4" profile="16.01">
+    <description>format from NCBI SRA</description>
+    <macros>
+        <import>sra_macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <version_command>fasterq-dump --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+    ## Runs fasterq-dump for the selected input and gzips the resulting FASTQ
+    ## files into the directories scanned by the output collections:
+    ##   output/      - paired (_forward/_reverse) and single-end (__single) data
+    ##   outputOther/ - every file that could not be classified as either
+
+    @SET_ACCESSIONS@
+    #if $input.input_select == "file":
+        ## Link the history dataset under its display name; the renaming logic
+        ## below expects the FASTQ files to be prefixed with that name.
+        acc='${input.file.name}' &&
+        ln -s '${input.file}' "\$acc" &&
+    #end if
+
+    fasterq-dump "\$acc" -e \${GALAXY_SLOTS:-1}
+    $adv.split
+    #if str( $adv.minlen ) != "":
+        --min-read-len "$adv.minlen"
+    #end if
+    ## The log path is single-quoted like every other path in this command.
+    $adv.skip_technical >> '$log' 2>&1
+    &&
+    mkdir -p output &&
+    mkdir -p outputOther &&
+    count=`ls *.fastq | wc -l` &&
+    echo "There are \$count fastq" &&
+    data=(\$(ls *.fastq)) &&
+    ## Exactly one FASTQ file: treat it as single-end data.
+    if [ "\$count" -eq 1 ]; then
+        gzip -c "\${data[0]}" > output/"\${acc}"__single.fastqsanger.gz &&
+        rm "\${data[0]}";
+    ## split-3: _1/_2 are the mates, an optional unsuffixed file goes to "other".
+    elif [ "$adv.split" = "--split-3" ]; then
+        if [ -e "\${acc}".fastq ]; then
+            gzip -c "\${acc}".fastq > outputOther/"\${acc}"__single.fastqsanger.gz;
+        fi &&
+        gzip -c "\${acc}"_1.fastq > output/"\${acc}"_forward.fastqsanger.gz &&
+        gzip -c "\${acc}"_2.fastq > output/"\${acc}"_reverse.fastqsanger.gz &&
+        rm "\${acc}"*.fastq;
+    ## Two files: reported as a pair only when technical reads were skipped.
+    elif [ "\$count" -eq 2 ]; then
+        #if $adv.skip_technical:
+            gzip -c "\${data[0]}" > output/"\${acc}"_forward.fastqsanger.gz &&
+            gzip -c "\${data[1]}" > output/"\${acc}"_reverse.fastqsanger.gz &&
+        #else
+            gzip -c "\${data[0]}" > outputOther/"\${data[0]}"sanger.gz &&
+            gzip -c "\${data[1]}" > outputOther/"\${data[1]}"sanger.gz &&
+        #end if
+        rm "\${data[0]}" &&
+        rm "\${data[1]}";
+    ## Any other number of files: keep them all, unclassified.
+    else
+        ## Quoted [@] expansion keeps each array element as one word.
+        for file in "\${data[@]}"; do
+            gzip -c "\$file" > outputOther/"\$file"sanger.gz &&
+            rm "\$file";
+        done;
+    fi;
+    ## Close the shell constructs that the accession macro above is expected
+    ## to have opened for these two input modes (macro body not shown here).
+    #if $input.input_select=="file_list":
+        ) ; done
+
+        ;
+    #elif  $input.input_select=="accession_number":
+    );
+    #end if
+    ]]>
+    </command>
+    <inputs>
+        <!-- Input source selector (accession, accession list or SRA dataset); defined in sra_macros.xml. -->
+        <expand macro="input_conditional"/>
+        <!-- Optional fasterq-dump switches; collapsed by default. -->
+        <section name="adv" title="Advanced Options" expanded="False">
+            <param name="minlen"
+                   type="integer"
+                   optional="true"
+                   argument="--min-read-len"
+                   label="Minimum read length"
+                   help="Filter by sequence length. Will dump only reads longer or equal to this value."/>
+            <!-- The option values are passed verbatim to fasterq-dump. -->
+            <param name="split"
+                   type="select"
+                   display="radio"
+                   label="Select how to split the spots"
+                   help="This option will only be used when there are multiple reads per spot (for example paired-end).">
+                <option value="--split-3">--split-3: write properly paired biological reads into different files and single reads in another file</option>
+                <option value="--split-files">--split-files: write reads into different files (forward and reverse may not match if one read is empty)</option>
+                <option value="--split-spot">--split-spot: split spots into reads (only one output file)</option>
+                <option value="--concatenate-reads">--concatenate-reads: writes whole spots into one file</option>
+            </param>
+            <param name="skip_technical"
+                   type="boolean"
+                   truevalue="--skip-technical"
+                   falsevalue="--include-technical"
+                   checked="True"
+                   argument="--skip-technical/--include-technical"
+                   label="Dump only biological reads"
+                   help="Will not be used if --split-3 is selected."/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- Combined stdout/stderr of every fasterq-dump invocation. -->
+        <data name="log" format="txt" label="fasterq-dump log"/>
+        <!-- Paired data: named regex groups pick up files called
+             <identifier_0>_<identifier_1>.fastqsanger.gz in "output". identifier_0
+             is the list identifier of the nested collection and identifier_1 is
+             either forward or reverse (for instance
+             SRR522874_forward.fastqsanger.gz). Single-end files use a double
+             underscore before "single" and therefore never match this pattern. -->
+        <collection name="list_paired" type="list:paired" label="Pair-end data (fasterq-dump)">
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz"/>
+        </collection>
+        <!-- Single-end data: files in "output" ending in __single.fastqsanger.gz. -->
+        <collection name="output_collection" type="list" label="Single-end data (fasterq-dump)">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)__single\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz"/>
+        </collection>
+        <!-- Everything written to "outputOther" (unclassified reads). -->
+        <collection name="output_collection_other" type="list" label="Other data (fasterq-dump)">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastqsanger\.gz" directory="outputOther" ext="fastqsanger.gz"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Single accession, default options: expects one forward/reverse pair. -->
+        <test>
+            <param name="input_select" value="accession_number"/>
+            <param name="accession" value="ERR086330"/>
+            <output_collection name="list_paired" type="list:paired">
+                <element name="ERR086330">
+                    <element name="forward" file="ERR086330_1.fastq.gz" decompress="True"/>
+                    <element name="reverse" file="ERR086330_2.fastq.gz" decompress="True"/>
+                </element>
+            </output_collection>
+        </test>
+        <!-- Single accession, split-files with technical reads included: both files land in the "other" collection. -->
+        <test>
+            <param name="input_select" value="accession_number"/>
+            <param name="accession" value="SRR002702"/>
+            <param name="split" value="--split-files"/>
+            <param name="skip_technical" value="False"/>
+            <output_collection name="output_collection_other" type="list">
+                <element name="SRR002702_1" file="SRR002702_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+                <element name="SRR002702_2" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+            </output_collection>
+        </test>
+        <!-- SRA dataset from history, split-files with technical reads skipped: expects a pair. -->
+        <test>
+            <param name="input_select" value="file"/>
+            <param name="file" value="SRR522874.sra"/>
+            <param name="split" value="--split-files"/>
+            <param name="skip_technical" value="True"/>
+            <output_collection name="list_paired" type="list:paired">
+                <element name="SRR522874.sra">
+                    <element name="forward" file="SRR522874.sra_2.fastq.gz" decompress="True"/>
+                    <element name="reverse" file="SRR522874.sra_4.fastq.gz" decompress="True"/>
+                </element>
+            </output_collection>
+        </test>
+        <!-- SRA dataset from history, split-files with technical reads included: four files in the "other" collection. -->
+        <test>
+            <param name="input_select" value="file"/>
+            <param name="file" value="SRR522874.sra"/>
+            <param name="split" value="--split-files"/>
+            <param name="skip_technical" value="False"/>
+            <output_collection name="output_collection_other" type="list">
+                <element name="SRR522874.sra_1" file="SRR522874.sra_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+                <element name="SRR522874.sra_2" file="SRR522874.sra_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+                <element name="SRR522874.sra_3" file="SRR522874.sra_3.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+                <element name="SRR522874.sra_4" file="SRR522874.sra_4.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+            </output_collection>
+        </test>
+        <!-- Accession list with a minimum read length: checks all three output collections. -->
+        <test>
+            <param name="input_select" value="file_list"/>
+            <param name="file_list" value="list_sra"/>
+            <param name="minlen" value="21"/>
+            <output_collection name="output_collection_other" type="list">
+                <element name="SRR522874__single" file="SRR522874.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+            </output_collection>
+            <output_collection name="list_paired" type="list:paired">
+                <element name="SRR522874">
+                    <element name="forward" file="SRR522874_1.fastq.gz" decompress="True"/>
+                    <element name="reverse" file="SRR522874_2.fastq.gz" decompress="True"/>
+                </element>
+            </output_collection>
+            <output_collection name="output_collection" type="list">
+                <element name="SRR002702" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does?**
+
+This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fasterq-dump_ utility of the SRA Toolkit.
+
+**How to use it?**
+
+There are three ways in which you can download data:
+
+ 1. Data for single accession
+ 2. Multiple datasets using a list of accessions
+ 3. Extract data from already uploaded SRA dataset
+
+Below we discuss each in detail.
+
+------
+
+**Uploading data for a single accession**
+
+When you type a single accession number (e.g., `SRR1582967`) into the **Accession** box and click **Execute**, the tool will fetch the data for you.
+
+-----
+
+**Uploading multiple datasets using a list of accessions**
+
+A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, with only one accession per line (see below for information on how to generate such a file). Once you have this file:
+
+ 1. Upload it into your history using Galaxy's upload tool
+ 2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from **select input type** dropdown
+ 3. Choose uploaded file within the **sra accession list** field
+ 4. Click **Execute**
+
+-----
+
+**Extract data from already uploaded SRA dataset**
+
+If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. Just like in the case of extracting data for a single accession number, the following applies:
+
+ - if data is paired-ended (or mate-pair) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see example below).
+ - if data is single ended, a standard fastq dataset will be produced
+
+-----
+
+**Output**
+
+In every case, fastq datasets produced will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. 
+In fact, three collections will be produced: one containing paired-end data, another containing single-end data, and a third one which contains reads which could not be classified.
+Some collections may be empty if the accessions provided in the list do not contain one of these types of data.
+
+.. class:: warningmark
+
+When you decide to dump technical reads (i.e., **Dump only biological reads** is set to *No* in **Advanced Options**), you will probably find your PAIRED data in the *Other data* collection, because it is impossible to determine whether the two reads are both biological or one is biological and the other technical.
+
+.. class:: warningmark
+
+By default, only biological reads are dumped, and in the case of a PAIRED dataset only the spots which have both reads will be in the paired-end collection. The remaining single reads will be in the other collection.
+To keep all reads, at the cost of possibly not having the same number of reads in forward and reverse, use the --split-files option under **Select how to split the spots** in **Advanced Options**.
+
+@ACCESSION_LIST_HOWTO@
+
+-----
+
+
+.. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
+.. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html
+.. _fasterq-dump: https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
+.. _collection: https://galaxyproject.org/tutorials/collections/
+.. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies
+
+@SRATOOLS_ATTRRIBUTION@
+
+]]>
+    </help>
+    <expand macro="citation"/>
+  </tool>
--- a/fastq_dump.xml	Tue Dec 04 15:00:55 2018 -0500
+++ b/fastq_dump.xml	Tue Sep 10 11:35:35 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="fastq_dump" name="Download and Extract Reads in FASTA/Q" version="@VERSION@.3">
+<tool id="fastq_dump" name="Download and Extract Reads in FASTA/Q" version="@VERSION@.4">
     <description>format from NCBI SRA</description>
     <macros>
         <import>sra_macros.xml</import>
@@ -70,6 +70,12 @@
     #elif str( $outputformat ) == "fastqsanger.bz2":   
         --bzip2
     #end if
+
+    #if str($adv.table) != "":
+        ## Single-quote the free-text value so that it always reaches the
+        ## command line as exactly one argument, even if it contains spaces.
+        --table '$adv.table'
+    #end if
+
+
     #if $input.input_select=="file":
         --stdout
         "$input.file" > "$output_file"
@@ -80,6 +86,7 @@
     #end if
 
     #if $input.input_select=="file_list":
+        "\$acc"
         ) ; done
 
         ;
@@ -126,6 +133,7 @@
             <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/>
             <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" />
             <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/>
+            <param name="table" label="Table name within cSRA object" type="text" value="" optional="true" help="For SRA of noisy long-reads put SEQUENCE" argument="--table"/>
         </section>
     </inputs>
     <outputs>
@@ -226,7 +234,15 @@
             <output_collection name="output_collection" type="list">
                 <element name="SRR1993644" file="SRR1993644.fastqsanger"/>
             </output_collection>
-        </test>            
+        </test>
+        <test>
+            <param name="input_select" value="accession_number"/>
+            <param name="outputformat" value="fastqsanger.gz"/>
+            <param name="accession" value="SRR6982805"/>
+            <param name="maxID" value="2"/>
+            <param name="table" value="SEQUENCE"/>
+            <output name="output_accession" file="SRR6982805.fastqsanger.gz" ftype="fastqsanger.gz" decompress="True"/>
+        </test>  
     </tests>
     <help><![CDATA[
 **What it does?**
--- a/sra_macros.xml	Tue Dec 04 15:00:55 2018 -0500
+++ b/sra_macros.xml	Tue Sep 10 11:35:35 2019 -0400
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION@">2.9.1</token>
+    <token name="@VERSION@">2.10</token>
     <token name="@ACCESSIONS_FROM_FILE@">
         grep '^[[:space:]]*[E|S|D]RR[0-9]\{1,\}[[:space:]]*$'
     </token>
@@ -14,7 +14,7 @@
 
     <macro name="requirements">
         <requirements>
-            <requirement type="package" version="2.9.1">sra-tools</requirement>
+            <requirement type="package" version="2.10">sra-tools</requirement>
             <requirement type="package" version="1.9">samtools</requirement>
         </requirements>
     </macro>
--- a/sra_pileup.xml	Tue Dec 04 15:00:55 2018 -0500
+++ b/sra_pileup.xml	Tue Sep 10 11:35:35 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="sra_pileup" name="Download and Generate Pileup Format" version="@VERSION@.2">
+<tool id="sra_pileup" name="Download and Generate Pileup Format" version="@VERSION@.3">
     <description>from NCBI SRA</description>
     <macros>
         <import>sra_macros.xml</import>
Binary file test-data/ERR086330_1.fastq.gz has changed
Binary file test-data/ERR086330_2.fastq.gz has changed
Binary file test-data/SRR002702_1.fastq.gz has changed
Binary file test-data/SRR002702_2.fastq.gz has changed
Binary file test-data/SRR522874.fastq.gz has changed
Binary file test-data/SRR522874.sra has changed
Binary file test-data/SRR522874.sra_1.fastq.gz has changed
Binary file test-data/SRR522874.sra_2.fastq.gz has changed
Binary file test-data/SRR522874.sra_3.fastq.gz has changed
Binary file test-data/SRR522874.sra_4.fastq.gz has changed
Binary file test-data/SRR522874_1.fastq.gz has changed
Binary file test-data/SRR522874_2.fastq.gz has changed
Binary file test-data/SRR6982805.fastqsanger.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/list_sra	Tue Sep 10 11:35:35 2019 -0400
@@ -0,0 +1,2 @@
+SRR522874
+SRR002702