Mercurial > repos > iuc > sra_tools
diff fastq_dump.xml @ 7:c7620aa7e1f0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit d1347141d384ed404f674d7ce408b6769e763ea1
author | iuc |
---|---|
date | Wed, 10 May 2017 10:45:41 -0400 |
parents | 30775c836c77 |
children | 1920e0508831 |
line wrap: on
line diff
--- a/fastq_dump.xml Wed Mar 22 05:23:31 2017 -0400 +++ b/fastq_dump.xml Wed May 10 10:45:41 2017 -0400 @@ -1,5 +1,5 @@ -<tool id="fastq_dump" name="Extract reads" version="@VERSION@.1"> - <description>in FASTQ/A format from NCBI SRA.</description> +<tool id="fastq_dump" name="Extract reads in Fastq/a" version="@VERSION@.2"> + <description>format from NCBI SRA</description> <macros> <import>sra_macros.xml</import> </macros> @@ -9,14 +9,21 @@ <![CDATA[ #if $input.input_select=="file_list": - for acc in `cat $input.file_list` ; - do + + for acc in `cat $input.file_list` ; + do + #elif $input.input_select=="accession_number": - acc="$input.accession" && + + ## Stripping leading and trailing spaces in case user typed them in + acc="${input.accession}" && + #end if #if $input.input_select=="file_list" or $input.input_select=="accession_number": - [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && ( + + [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && ( + #end if ## Need to set the home directory to the current working directory, @@ -74,38 +81,35 @@ $adv.clip $adv.skip_technical - #if str( $outputformat ) == "fasta": - --fasta + #if str( $outputformat ) == "fastqsanger.gz": + --gzip + #elif str( $outputformat ) == "fastqsanger.bz2": + --bzip2 #end if #if $input.input_select=="file": --stdout "$input.file" > "$output_file" - #elif $input.input_select=="file_list": - "\$acc" - #else: - --stdout + + #elif $input.input_select=="accession_number": + --stdout "\$acc" > "$output_accession" ) #end if #if $input.input_select=="file_list": - ) ; done - - ; + ) ; done - - - + ; - for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do - count=`ls \$i* | wc -l` ; - data=(\$(ls -d \$i*)); + for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do + count=`ls \$i* | wc -l` ; + data=(\$(ls -d \$i*)); - if [ "\$count" -eq 2 ]; then - mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ; - elif [ "\$count" -eq 1 ]; then - mv "\${data[0]}" 
"\${data[0]}"__single.$outputformat ; - fi; - done + if [ "\$count" -eq 2 ]; then + mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ; + elif [ "\$count" -eq 1 ]; then + mv "\${data[0]}" "\${data[0]}"__single.$outputformat ; + fi; + done #end if @@ -115,129 +119,239 @@ </command> <inputs> <expand macro="input_conditional"/> - <param name="outputformat" type="select" label="select output format"> - <option value="fastqsanger">fastq</option> - <option value="fasta">fasta</option> + <param name="outputformat" type="select" display="radio" label="Select output format" help="Compression will greatly reduce the amount of space occupied by downloaded data. Downstream applications such as a short-read mappers will accept compressed data as input. Consider this example: an uncoimpressed 400 Mb fastq datasets compresses to 100 Mb or 80 Mb by gzip or bzip2, respectively. " argument="--gzip --bzip2"> + <option value="fastqsanger.gz">gzip compressed fastq</option> + <option value="fastqsanger">Uncompressed fastq</option> + <option value="fastqsanger.bz2">bzip2 compressed fastq</option> </param> <section name="adv" title="Advanced Options" expanded="False"> - <param name="minID" type="integer" label="minimum spot ID" optional="true"/> - <param name="maxID" type="integer" label="maximum spot ID" optional="true"/> - <param name="minlen" type="integer" label="minimum read length" optional="true"/> - <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue=""> - <label>split spot by read pairs</label> - </param> + <param name="minID" type="integer" label="Minimum spot ID" optional="true" help="Minimum spot id to be dumped." argument="--minSpotId"/> + <param name="maxID" type="integer" label="Maximum spot ID" optional="true" help="Maximum spot id to be dumped." argument="--maxSpotId"/> + <param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. 
Will dump only reads longer or equal to this value." argument="--minReadLen"/> + <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="Split spot by read pairs" help="Split spots into individual reads." argument="--split-spot"/> <expand macro="alignments"/> <expand macro="region"/> <expand macro="matepairDist"/> - <param name="readfilter" type="select" value=""> - <label>filter by value</label> + <param name="readfilter" type="select" value="" label="filter by value" argument="--read-filter"> <option value="">None</option> <option value="pass">pass</option> <option value="reject">reject</option> <option value="criteria">criteria</option> <option value="redacted">redacted</option> </param> - <param name="spotgroups" type="text" label="filter by spot-groups" optional="true"/> - <param name="clip" type="boolean" truevalue="--clip" falsevalue=""> - <label>apply left and right clips</label> - </param> - <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads"/> + <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/> + <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" /> + <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/> </section> </inputs> <outputs> - <collection name="list_paired" type="list:paired" label="Pair-end Fast(q|a)"> - <filter>input['input_select'] == "file_list"</filter> + <collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)"> + <filter>input['input_select'] == "file_list"</filter> + <!-- Use named regex group to grab pattern <identifier_0>_<identifier_1>.fq. 
Here identifier_0 is the list identifier in the nested collection and identifier_1 is either forward or reverse (for instance samp1_forward.fq). --> - <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq_(?P<identifier_1>[^_]+)\.fastq" ext="fastqsanger" visible="false" /> - <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fasta_(?P<identifier_1>[^_]+)\.fasta" ext="fasta" visible="false" /> - </collection> - <collection name="output_collection" type='list' label="Single-end Fast(q|a)"> - <filter>input['input_select'] == "file_list"</filter> - <discover_datasets pattern="(?P<designation>.+)_\d+.fastq__single\.fastq" directory="." ext='fastqsanger'/> - <discover_datasets pattern="(?P<designation>.+)_\d+.fasta__single\.fasta" directory="." ext='fasta'/> - </collection> - <data format="fastqsanger" name="output_accession" > - <filter>input['input_select'] == "accession_number"</filter> - <change_format> - <when input="outputformat" value="fasta" format="fasta"/> - </change_format> - </data> - <data format="fastqsanger" name="output_file" label="${input.file.name}.${outputformat}"> - <filter>input['input_select'] == "file"</filter> - <change_format> - <when input="outputformat" value="fasta" format="fasta"/> - </change_format> - </data> + + <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq_(?P<identifier_1>[^_]+)\.fastqsanger" ext="fastqsanger" /> + <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq.gz_(?P<identifier_1>[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" /> + <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq.bz2_(?P<identifier_1>[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" /> + </collection> + <collection name="output_collection" type='list' label="Single-end data (fastq-dump)"> + <filter>input['input_select'] == "file_list"</filter> + <discover_datasets pattern="(?P<designation>.+)_\d+.fastq__single\.fastqsanger" directory="." 
ext='fastqsanger'/> + <discover_datasets pattern="(?P<designation>.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="." ext='fastqsanger.gz'/> + <discover_datasets pattern="(?P<designation>.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="." ext='fastqsanger.bz2'/> + </collection> + <data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)"> + <filter>input['input_select'] == "accession_number"</filter> + <change_format> + <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/> + <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/> + </change_format> + </data> + <data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)"> + <filter>input['input_select'] == "file"</filter> + <change_format> + <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/> + <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/> + </change_format> + </data> </outputs> <tests> - <test> - <param name="input_select" value="accession_number"/> - <param name="outputformat" value="fastqsanger"/> - <param name="accession" value="SRR044777"/> - <param name="skip_technical" value="True"/> - <output name="output_accession"> - <assert_contents> - <not_has_text text="rRNA_primer"/> - <has_text text="F47USSH02GNP1D" /> - </assert_contents> - </output> - </test> - <test> - <param name="input_select" value="accession_number"/> - <param name="outputformat" value="fastqsanger"/> - <param name="accession" value="SRR925743"/> - <param name="maxID" value="5"/> - <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> - </test> - <test> - <param name="input_select" value="file_list"/> - <param name="outputformat" value="fastqsanger"/> - <param name="file_list" value="list_pe"/> - <param name="maxID" value="5"/> - <output_collection name="list_paired" type="list:paired"> - <element name="DRR015708"> - <element name="forward" 
file="DRR015708_forward.fastqsanger"> - </element> - <element name="reverse" file="DRR015708_reverse.fastqsanger"> - </element> - </element> - </output_collection> - </test> - <test> - <param name="input_select" value="file_list"/> - <param name="outputformat" value="fastqsanger"/> - <param name="file_list" value="list_pe2"/> - <param name="maxID" value="5"/> - <output_collection name="list_paired" type="list:paired"> - <element name="ERR027433"> - <element name="forward" file="ERR027433_forward.fastqsanger"> - </element> - <element name="reverse" file="ERR027433_reverse.fastqsanger"> - </element> - </element> - </output_collection> - </test> - <test> - <param name="input_select" value="file_list"/> - <param name="outputformat" value="fastqsanger"/> - <param name="file_list" value="list_se"/> - <param name="maxID" value="5"/> - <output_collection name="output_collection" type="list"> - <element name="SRR1993644" file="SRR1993644.fastqsanger"/> - </output_collection> - </test> + <test> + <param name="input_select" value="accession_number"/> + <param name="outputformat" value="fastqsanger"/> + <param name="accession" value="SRR044777"/> + <param name="skip_technical" value="True"/> + <output name="output_accession"> + <assert_contents> + <not_has_text text="rRNA_primer"/> + <has_text text="F47USSH02GNP1D" /> + </assert_contents> + </output> + </test> + <test> + <param name="input_select" value="accession_number"/> + <param name="outputformat" value="fastqsanger.gz"/> + <param name="accession" value="SRR925743"/> + <param name="maxID" value="5"/> + <output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/> + </test> + <test> + <param name="input_select" value="accession_number"/> + <param name="outputformat" value="fastqsanger"/> + <param name="accession" value="SRR925743"/> + <param name="maxID" value="5"/> + <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> + </test> + <test> + <param name="input_select" 
value="file_list"/> + <param name="outputformat" value="fastqsanger"/> + <param name="file_list" value="list_pe"/> + <param name="maxID" value="5"/> + <output_collection name="list_paired" type="list:paired"> + <element name="DRR015708"> + <element name="forward" file="DRR015708_forward.fastqsanger"> + </element> + <element name="reverse" file="DRR015708_reverse.fastqsanger"> + </element> + </element> + </output_collection> + </test> + <test> + <param name="input_select" value="file_list"/> + <param name="outputformat" value="fastqsanger"/> + <param name="file_list" value="list_pe2"/> + <param name="maxID" value="5"/> + <output_collection name="list_paired" type="list:paired"> + <element name="ERR027433"> + <element name="forward" file="ERR027433_forward.fastqsanger"> + </element> + <element name="reverse" file="ERR027433_reverse.fastqsanger"> + </element> + </element> + </output_collection> + </test> + <test> + <param name="input_select" value="file_list"/> + <param name="outputformat" value="fastqsanger"/> + <param name="file_list" value="list_se"/> + <param name="maxID" value="5"/> + <output_collection name="output_collection" type="list"> + <element name="SRR1993644" file="SRR1993644.fastqsanger"/> + </output_collection> + </test> </tests> - <help> - This tool extracts reads from SRA archives using fastq-dump. - The fastq-dump program is developed at NCBI, and is available at - http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software. + <help><![CDATA[ +**What it does?** + +This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit. + +**How to use it?** + +There are three ways in which you can download data: + + 1. Data for single accession + 2. Multiple datasets using a list of accessions + 3. Extract data from already uploaded SRA dataset + +Below we discuss each in detail. 
+ +------ + +**Uploading data for a single accession** + +When you type a single accession number (e.g., `SRR1582967`) into the **Accession** box and click **Execute**, the tool will fetch data for you. It is important to keep the following in mind: + + - if data is paired-end (or mate-pair) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see an example dataset below) + - if data is single-end, a standard single fastq dataset will be produced + +----- + +**Uploading multiple datasets using a list of accessions** + +A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, where there is only one accession per line (see below for information on how to generate such a file). Once you have this file: + + 1. Upload it into your history using Galaxy's upload tool + 2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from the **select input type** dropdown + 3. Choose the uploaded file within the **sra accession list** field + 4. Click **Execute** + +.. class:: warningmark + +Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. The single-end or paired-end collection may be empty if the accessions provided in the list contain only PAIRED or SINGLE data, respectively. + +----- + +**Extract data from already uploaded SRA dataset** + +If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. 
Just like in the case of extracting data for single accession number the following applies: + + - if data is paired-ended (or mate-pair) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see example below). + - if data is single ended, a standard fastq dataset will be produced + +@ACCESSION_LIST_HOWTO@ + +----- + +**Paired-end (and mate-pair) data in fastq format** - NB: Single-end or pair-end collections may be empty if given SRRs LibraryLayout contains only either SINGLE or PAIRED respectively - @SRATOOLS_ATTRRIBUTION@ +Paired end datasets can be represented as two individual datasets: + +First dataset:: + + @1/1 + AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA + + + EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED + @2/1 + AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA + + + HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG + +Second dataset:: + + @1/2 + CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC + + + GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF + @2/2 + CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC + + + HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH + +Or a single *interleaved* dataset:: + + @1/1 + AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA + + + EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED + @1/2 + CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC + + + GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF + @2/1 + AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA + + + HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG + @2/2 + CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC + + + HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH + +---- + + +.. _fastq: https://en.wikipedia.org/wiki/FASTQ_format +.. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html +.. _collection: https://galaxyproject.org/tutorials/collections/ +.. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies + +@SRATOOLS_ATTRRIBUTION@ + +]]> </help> <expand macro="citation"/> </tool>