view paired_fastq_filtering.xml @ 5:378565f5a875 draft

Uploaded
author petr-novak
date Fri, 22 Nov 2019 07:56:48 -0500
parents a4cd8608ef6b
children f224513123a1
line wrap: on
line source

<tool id="paired_fastq_filtering" name="Preprocessing of fastq paired-reads">
  <!-- Job status rule: any exit code of 1 or above is reported as a fatal error. -->
  <stdio>
    <exit_code range="1:" level="fatal" description="Error"/>
  </stdio>
  <!-- Brief summary of the processing steps this tool performs. -->
  <description>
    Preprocessing of paired reads fastq files
    including trimming, quality filtering, cutadapt filtering and interlacing. Broken
    pairs are discarded.
  </description>
  <!--
    Packages declared as dependencies of this tool.
    cutadapt performs the adapter filtering described in the help section.
    NOTE(review): blast is presumably used by the similarity filter (-F) and the
    R packages by scripts called from the wrapper; confirm against the wrapper script.
  -->
  <requirements>
    <requirement type="package">blast</requirement>
    <requirement type="package">cutadapt</requirement>
    <requirement type="package">bioconductor-shortread</requirement>
    <requirement type="package">r-optparse</requirement>
  </requirements>
  <!--
    Command template for paired_fastq_filtering_wrapper.sh.

    Always passed:
      -a / -b  left-hand and right-hand read datasets
      -o       interlaced output dataset
      -c       quality cut-off
      -p       percent of bases that must reach the cut-off
      -N       maximum number of Ns
      -G       PNG output dataset
      $rename  expands to -R or to nothing, depending on the "Rename sequences" checkbox

    Appended only when the matching conditional is switched on:
      -n       sample size
      -e / -s  trimming end and start positions
      -C       custom cutadapt options
      -F       filter database dataset

    Dataset paths are single-quoted so that a path containing spaces or shell
    metacharacters reaches the wrapper as one argument.

    NOTE(review): custom_options has sanitization disabled and is interpolated
    inside double quotes, so characters such as $, ` and " typed by the user are
    interpreted by the shell. Left unchanged because legitimate cutadapt options
    may need quoting; confirm this is acceptable for the deployment.
  -->
  <command interpreter="bash"><![CDATA[
    paired_fastq_filtering_wrapper.sh -a '${A}' -b '${B}' -o '${paired}' -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G '${png_output}'

    #if $sampling.sequence_sampling :
    -n $sampling.sample_size
    #end if

    #if $trimming.sequence_trimming :
    -e $trimming.trim_end -s $trimming.trim_start
    #end if

    #if $cutadapt.use_custom :
    -C "${cutadapt.custom_options}"
    #end if

    #if $similarity_filtering.include :
    -F "${similarity_filtering.filter_database}"
    #end if

  ]]></command>

  <inputs>
    <!-- The two mate files of the paired-end run (wrapper options -a and -b). -->
    <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />

    <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />

    <!-- Optional sampling: sample_size is passed as -n only when enabled. -->
    <conditional name="sampling">
      <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param name="sample_size" type="integer" label="Sample size(number of pairs)" help="How many sequence pairs should be in resulting dataset" value="500000" min="0"/>
      </when>
    </conditional>

    <!-- Quality filter thresholds (wrapper options -c and -p). -->
    <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" />
    <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0"
           help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" />

    <!-- Optional trimming: trim_start / trim_end are passed as -s / -e only when enabled. -->
    <conditional name="trimming">
      <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
               help="sequences are trimmed at specified start" />
        <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
               help="sequences are trimmed to specified end position, shorted sequences are discarded" />
      </when>
    </conditional>

    <!-- Upper limit on N bases per sequence (wrapper option -N). -->
    <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>

    <!--
      Optional free-text cutadapt options, passed as -C only when enabled.
      NOTE(review): sanitization is switched off for this field and the command
      template interpolates it inside double quotes, so shell metacharacters in
      the value are interpreted by the shell; confirm this is intended.
    -->
    <conditional name="cutadapt">
      <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param name="custom_options" type="text" area="True" size="8x30"  label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
          <sanitizer sanitize="False"/>
        </param>
      </when>
    </conditional>

    <!-- Optional similarity filter: the FASTA database is passed as -F only when enabled. -->
    <conditional name="similarity_filtering">
      <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which has at least 90% similarity over 90% of length to sequence in filter database will be removed. This is suitable option if you want to remove organele DNA or contamination"/>
      </when>
    </conditional>

    <!-- Checkbox that expands to the -R flag when ticked and to nothing otherwise. -->
    <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="False" label="Rename sequences" help="By default, original sequence ID are used, in case your sequences do not follow proper naming scheme to label paired-end read mate, use this option. All read pairs must be complete!"/>
  </inputs>


  <outputs>
    <!-- Interlaced reads in FASTA format; written by the wrapper via -o. -->
    <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
    <!-- Nucleotide composition plot; written by the wrapper via -G. -->
    <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
  </outputs>


  <tests>
    <!--
      Runs the tool on a gzipped read pair with the default quality settings
      (sampling, trimming, custom cutadapt options and similarity filtering stay off).
      Each output name must match a data element declared in the outputs section,
      so the FASTA result is checked under its declared name "paired".
    -->
    <test>
      <param name="A" value="ERR215189_1_part.fastq.gz" />
      <param name="B" value="ERR215189_2_part.fastq.gz" />
      <param name="max_n" value="0"/>
      <param name="cut_off" value="10" />
      <param name="percent_above" value="95" />
      <output name="paired" value="paired_output.fasta" />
      <output name="png_output" value="paired_output.png" />
    </test>
  </tests>

  <help>
**What it does**

This tool is designed for memory-efficient preprocessing of two
fastq files. Output of this tool can be used as input of RepeatExplorer clustering.
Input files can be gzip-compressed (.gz extension).
Reads are filtered based on quality, presence of N bases and
adapters. The two input fastq files are processed in parallel. Only complete pairs
are kept. As the input files are processed in chunks, it is required that
paired reads are complete and in the same order in both input files. All
reads which pass the quality filter will be written to the output.
If sampling is specified, only a sample of sequences will be
returned. Cutadapt is run with these options::

    --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
    --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
    --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
    --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
    --anywhere='CAAGCAGAAGACGGCATACGAGAT'
    --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
    --error-rate=0.05
    --times=1 --overlap=15 --discard


**Order of fastq files processing**

1. Trimming (optional)       
#. Filter by quality      
#. Discard single reads, keep complete pairs
#. Cutadapt filtering 
#. Discard single reads, keep complete pairs    
#. Sampling (optional)         
#. Interlacing two fasta files

**Quality setting cut-off**

To correctly set quality cut-off, you need to know how the quality is encoded in your fastq file, default
filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::


      Default filtering cut-off
                |   
                |
                V
      SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
      ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
      ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
      .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
      LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
      !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
      |                         |    |        |                              |                     |
     33                        59   64       73                            104                   126
      0........................26...31.......40                                
                               -5....0........9.............................40 
                                     0........9.............................40 
                                        3.....9.............................40 
      0.2......................26...31........41                              
    
     S - Sanger        Phred+33,  raw reads typically (0, 40)
     X - Solexa        Solexa+64, raw reads typically (-5, 40)
     I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
     J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
         with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) 
         (Note: See discussion above).
     L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
    
  </help>    
</tool>