Mercurial > repos > iuc > sra_tools

<tool id="fastq_dump" name="Extract reads" version="@VERSION@.1">
    <!-- Short description of what the tool does. -->
    <description>in FASTQ/A format from NCBI SRA.</description>
    <!-- Import the macro definitions from sra_macros.xml. The expand
         elements in this file refer to macros by name; presumably they
         are all defined in that import (confirm against sra_macros.xml). -->
    <macros>
        <import>sra_macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Command used to report the version of the wrapped program. -->
    <version_command>fastq-dump --version</version_command>
    <command detect_errors="exit_code">
        <![CDATA[

    ## Overview of the generated shell command (Cheetah template):
    ##   * "file" mode: dump the supplied SRA archive to a single output
    ##     dataset via stdout.
    ##   * "accession_number" mode: fetch one accession and dump it to a
    ##     single output dataset via stdout.
    ##   * "file_list" mode: loop over the accessions read from a file,
    ##     dump each one into the working directory, then rename the
    ##     resulting files so the output collections can discover them.

    #if $input.input_select=="file_list":
      for acc in `cat "$input.file_list"` ;
      do
    #elif $input.input_select=="accession_number":
      acc="$input.accession" &&
    #end if

    #if $input.input_select=="file_list" or $input.input_select=="accession_number":
          ## Accept only run accessions: ERR, SRR or DRR followed by digits.
          ## The bash [[ ... =~ ... ]] form is required here; with single
          ## brackets and a quoted pattern the whole expression collapses
          ## into one non-empty word and the test is always true.
          ## An invalid accession is reported on stderr; the subshell that
          ## follows is then skipped.
          { [[ "\$acc" =~ ^[ESD]RR[0-9]+\$ ]] || { echo "Invalid run accession: \$acc" >&2 ; false ; } ; } && (
    #end if

    ## Need to set the home directory to the current working directory,
    ## else the tool tries to write to home/.ncbi and fails when used
    ## with a cluster manager.
    export HOME=\$PWD &&
    vdb-config --restore-defaults &&
    #if $input.input_select == "file":
        fastq-dump --log-level fatal --accession '${input.file.name}'
    #else:
        vdb-config -s "/repository/user/main/public/root=\$PWD" &&
        ## Do not use prefetch if region is specified, to avoid downloading
        ## the complete sra file.
        #if ( str( $adv.region ) == "" ) and ( str( $adv.minID ) == "" ) and ( str( $adv.maxID ) == "" ):
            ## The "|| true" keeps the chain going when ascp is not found.
            ASCP_PATH=`command -v ascp` &&
            ASCP_KEY=`dirname \$ASCP_PATH`/asperaweb_id_dsa.openssh || true &&
            prefetch -X 200G --ascp-path "\$ASCP_PATH|\$ASCP_KEY" "\$acc" &&
            ## Duplicate vdb-config, in case settings changed between prefetch and
            ## dump command.
            vdb-config -s "/repository/user/main/public/root=\$PWD" &&
        #end if
        fastq-dump --accession "\$acc"
        --split-files
    #end if
    --defline-seq '@\$sn[_\$rn]/\$ri'

    ## Optional arguments taken from the "adv" section; each one is only
    ## emitted when the corresponding parameter is set.
    $adv.split
    #if str( $adv.alignments ) == "aligned":
        --aligned
    #end if
    #if str( $adv.alignments ) == "unaligned":
        --unaligned
    #end if
    #if str( $adv.minID ) != "":
        --minSpotId "$adv.minID"
    #end if
    #if str( $adv.maxID ) != "":
        --maxSpotId "$adv.maxID"
    #end if
    #if str( $adv.minlen ) != "":
        --minReadLen "$adv.minlen"
    #end if
    #if str( $adv.readfilter ) != "":
        --read-filter "$adv.readfilter"
    #end if
    #if str( $adv.region ) != "":
        --aligned-region "$adv.region"
    #end if
    #if str( $adv.spotgroups ) != "":
        --spot-groups "$adv.spotgroups"
    #end if
    #if str( $adv.matepairDist ) != "":
        --matepair-distance "$adv.matepairDist"
    #end if
    $adv.clip
    $adv.skip_technical

    #if str( $outputformat ) == "fasta":
        --fasta
    #end if
    ## Final positional argument and output redirection. The closing
    ## parenthesis ends the subshell opened after the accession check.
    #if $input.input_select=="file":
        --stdout
        "$input.file" > "$output_file"
    #elif $input.input_select=="file_list":
        "\$acc"
    #else:
         --stdout
        "\$acc" > "$output_accession" )
    #end if

    #if $input.input_select=="file_list":
    ) ; done

    ;

    ## Rename the dumped files for collection discovery. Files are grouped
    ## by the part of the name before the first underscore: a group of two
    ## files gets the _forward / _reverse suffixes, a group of one file
    ## gets the __single suffix. Groups of any other size are left as is.
    for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
      count=`ls \$i* | wc -l` ;
      data=(\$(ls -d \$i*));

      if [ "\$count" -eq 2 ]; then
         mv "\${data[0]}" "\${data[0]}"_forward.$outputformat;  mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ;
      elif [ "\$count" -eq 1 ]; then
         mv "\${data[0]}" "\${data[0]}"__single.$outputformat ;
      fi;
    done


    #end if


    ]]>
    </command>
    <inputs>
        <!-- Input source selector, expanded from the input_conditional
             macro. The command reads it as input.input_select with the
             values file, file_list and accession_number. -->
        <expand macro="input_conditional"/>
        <!-- Output format. The command adds the fasta flag when the value
             is "fasta" and also uses the value as the suffix of the
             renamed files in file_list mode. -->
        <param name="outputformat" type="select" label="select output format">
            <option value="fastqsanger">fastq</option>
            <option value="fasta">fasta</option>
        </param>
        <!-- Optional settings; the command emits an argument for each of
             them only when it is set. -->
        <section name="adv" title="Advanced Options" expanded="False">
            <param name="minID" type="integer" optional="true" label="minimum spot ID"/>
            <param name="maxID" type="integer" optional="true" label="maximum spot ID"/>
            <param name="minlen" type="integer" optional="true" label="minimum read length"/>
            <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="split spot by read pairs"/>
            <expand macro="alignments"/>
            <expand macro="region"/>
            <expand macro="matepairDist"/>
            <!-- The empty option means that no read filter is passed. -->
            <param name="readfilter" type="select" value="" label="filter by value">
                <option value="">None</option>
                <option value="pass">pass</option>
                <option value="reject">reject</option>
                <option value="criteria">criteria</option>
                <option value="redacted">redacted</option>
            </param>
            <param name="spotgroups" type="text" optional="true" label="filter by spot-groups"/>
            <param name="clip" type="boolean" truevalue="--clip" falsevalue="" label="apply left and right clips"/>
            <param name="skip_technical" type="boolean" checked="False" truevalue="--skip-technical" falsevalue="" label="Dump only biological reads"/>
        </section>
    </inputs>
    <outputs>
      <!-- file_list mode, paired runs. The rename step of the command
           turns a pair of dumped files ACC_N.fastq (or ACC_N.fasta) into
           ACC_N.fastq_forward.EXT and ACC_N.fastq_reverse.EXT, where EXT
           is the selected output format. The named group identifier_0
           (the accession) is the element of the outer list and
           identifier_1 (forward or reverse) the element of the pair.
           The dot before fastq/fasta is escaped so that it only matches
           a literal dot.
           NOTE(review): the patterns end in \.fastq while the renamed
           files end in .fastqsanger; this presumably relies on the
           pattern not being anchored at the end of the name. Confirm
           against the Galaxy discover_datasets documentation. -->
      <collection name="list_paired" type="list:paired" label="Pair-end Fast(q|a)">
        <filter>input['input_select'] == "file_list"</filter>
        <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+\.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastq" ext="fastqsanger" visible="false" />
        <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+\.fasta_(?P&lt;identifier_1&gt;[^_]+)\.fasta" ext="fasta" visible="false" />
      </collection>
      <!-- file_list mode, single runs. The rename step gives a lone
           dumped file the __single suffix; the designation group (the
           accession) becomes the element identifier. -->
      <collection name="output_collection" type="list" label="Single-end Fast(q|a)">
        <filter>input['input_select'] == "file_list"</filter>
        <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+\.fastq__single\.fastq" directory="." ext="fastqsanger"/>
        <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+\.fasta__single\.fasta" directory="." ext="fasta"/>
      </collection>
      <!-- accession_number mode: single dataset written through stdout
           redirection; the format switches to fasta when selected. -->
      <data format="fastqsanger" name="output_accession" >
        <filter>input['input_select'] == "accession_number"</filter>
        <change_format>
          <when input="outputformat" value="fasta" format="fasta"/>
        </change_format>
      </data>
      <!-- file mode: single dataset written through stdout redirection,
           labelled after the input file name and the output format. -->
      <data format="fastqsanger" name="output_file" label="${input.file.name}.${outputformat}">
        <filter>input['input_select'] == "file"</filter>
        <change_format>
          <when input="outputformat" value="fasta" format="fasta"/>
        </change_format>
      </data>
    </outputs>
    <tests>
      <!-- Single accession with skip_technical enabled: the output must
           not contain the text rRNA_primer and must contain the text
           F47USSH02GNP1D. -->
      <test>
        <param name="input_select" value="accession_number"/>
        <param name="outputformat" value="fastqsanger"/>
        <param name="accession" value="SRR044777"/>
        <param name="skip_technical" value="True"/>
        <output name="output_accession">
          <assert_contents>
            <not_has_text text="rRNA_primer"/>
            <has_text text="F47USSH02GNP1D"/>
          </assert_contents>
        </output>
      </test>
      <!-- Single accession limited to spot IDs up to 5, compared against
           a reference file. -->
      <test>
        <param name="input_select" value="accession_number"/>
        <param name="outputformat" value="fastqsanger"/>
        <param name="accession" value="SRR925743"/>
        <param name="maxID" value="5"/>
        <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/>
      </test>
      <!-- Accession list list_pe: expects the paired collection to hold
           a forward/reverse pair for DRR015708. -->
      <test>
        <param name="input_select" value="file_list"/>
        <param name="outputformat" value="fastqsanger"/>
        <param name="file_list" value="list_pe"/>
        <param name="maxID" value="5"/>
        <output_collection name="list_paired" type="list:paired">
          <element name="DRR015708">
            <element name="forward" file="DRR015708_forward.fastqsanger"/>
            <element name="reverse" file="DRR015708_reverse.fastqsanger"/>
          </element>
        </output_collection>
      </test>
      <!-- Accession list list_pe2: expects the paired collection to hold
           a forward/reverse pair for ERR027433. -->
      <test>
        <param name="input_select" value="file_list"/>
        <param name="outputformat" value="fastqsanger"/>
        <param name="file_list" value="list_pe2"/>
        <param name="maxID" value="5"/>
        <output_collection name="list_paired" type="list:paired">
          <element name="ERR027433">
            <element name="forward" file="ERR027433_forward.fastqsanger"/>
            <element name="reverse" file="ERR027433_reverse.fastqsanger"/>
          </element>
        </output_collection>
      </test>
      <!-- Accession list list_se: expects the single-end collection to
           hold one element for SRR1993644. -->
      <test>
        <param name="input_select" value="file_list"/>
        <param name="outputformat" value="fastqsanger"/>
        <param name="file_list" value="list_se"/>
        <param name="maxID" value="5"/>
        <output_collection name="output_collection" type="list">
          <element name="SRR1993644" file="SRR1993644.fastqsanger"/>
        </output_collection>
      </test>
    </tests>
    <help>
        This tool extracts reads from SRA archives using fastq-dump.
        The fastq-dump program is developed at NCBI, and is available at
        http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.

        NB: The single-end or paired-end collection may be empty if the LibraryLayout of the given run accessions is exclusively PAIRED or SINGLE, respectively.
        @SRATOOLS_ATTRRIBUTION@
    </help>
    <expand macro="citation"/>
  </tool>
author	iuc
date	Wed, 22 Mar 2017 05:23:31 -0400
parents	62e4d56ebb6f
children	c7620aa7e1f0