Mercurial > repos > petr-novak > re_utils

diff paired_fastq_filtering.xml @ 0:a4cd8608ef6b draft
Uploaded
author: petr-novak
date: Mon, 01 Apr 2019 07:56:36 -0400
children: 378565f5a875
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/paired_fastq_filtering.xml	Mon Apr 01 07:56:36 2019 -0400
@@ -0,0 +1,181 @@
+<tool id="paired_fastq_filtering" name="Preprocessing of fastq paired-reads">
+  <!-- One-sentence summary of the tool: names the processing steps and states that broken pairs are dropped. -->
+  <description>
+    Preprocessing of paired reads fastq files
+    including trimming, quality filtering, cutadapt filtering and interlacing. Broken
+    pairs are discarded.
+  </description>
+  <!-- Packages that have to be available when the wrapper script is executed. -->
+  <requirements>
+    <!-- NOTE(review): presumably backs the optional similarity-search filter (-F); confirm against the wrapper script -->
+    <requirement type="package">blast</requirement>
+    <!-- adapter filtering; the default options are listed in the help section -->
+    <requirement type="package">cutadapt</requirement>
+    <!-- NOTE(review): R packages, presumably used by an R script behind the wrapper; confirm -->
+    <requirement type="package">bioconductor-shortread</requirement>
+    <requirement type="package">r-optparse</requirement>
+    <!-- NOTE(review): no versions are pinned; consider adding version attributes once the tested versions are known -->
+  </requirements>
+  <!--
+    Command line handed to paired_fastq_filtering_wrapper.sh.
+    Always passed:
+      -a / -b  left-hand and right-hand read datasets (A, B)
+      -o       interlaced output dataset (paired)
+      -c / -p  values of cut_off and percent_above
+      -N       value of max_n
+      -G       PNG output dataset (png_output)
+      $rename  expands to -R when the box is ticked, otherwise to nothing
+    The optional groups (-n, -e/-s, -C, -F) are emitted only when the
+    matching checkbox is ticked.
+    Dataset paths are single-quoted so the shell never word-splits or
+    glob-expands them.
+    NOTE(review): custom_options has its sanitizer disabled and is placed
+    inside double quotes, so the shell still interprets characters such as
+    $, ` and " typed by the user. Confirm this is acceptable for the
+    deployment, or restrict the characters allowed in that field.
+  -->
+  <command interpreter="bash">
+    paired_fastq_filtering_wrapper.sh -a '${A}' -b '${B}' -o '${paired}' -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G '${png_output}'
+
+    #if $sampling.sequence_sampling :
+    -n $sampling.sample_size
+    #end if
+
+    #if $trimming.sequence_trimming :
+    -e $trimming.trim_end -s $trimming.trim_start
+    #end if
+
+    #if $cutadapt.use_custom :
+    -C "${cutadapt.custom_options}"
+    #end if
+
+    #if $similarity_filtering.include :
+    -F '${similarity_filtering.filter_database}'
+    #end if
+
+  </command>
+
+  <inputs>
+    <!-- The two mate files; the help section requires the same read order in both. -->
+    <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />
+
+    <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />
+
+    <!-- Optional sampling: sample_size is passed as -n only when the box is ticked. -->
+    <conditional name="sampling">
+      <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/>
+      <when value="false">
+        <!-- no extra parameters -->
+      </when>
+      <when value="true">
+        <param name="sample_size" type="integer" label="Sample size(number of pairs)" help="How many sequence pairs should be in resulting dataset" value="500000" min="0"/>
+      </when>
+    </conditional>
+
+    <!-- Quality filter: passed as -c and -p. -->
+    <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" />
+    <!-- max="100" because the value is a percentage; larger values are meaningless. -->
+    <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100"
+           help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" />
+
+    <!-- Optional trimming: trim_end / trim_start are passed as -e / -s only when the box is ticked. -->
+    <conditional name="trimming">
+      <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
+      <when value="false">
+        <!-- no extra parameters -->
+      </when>
+      <when value="true">
+        <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
+               help="sequences are trimmed at specified start" />
+        <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
+               help="sequences are trimmed to specified end position, shorted sequences are discarded" />
+      </when>
+
+    </conditional>
+    <!-- Passed as -N. -->
+    <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>
+
+    <!-- Optional free-text cutadapt options, passed as -C. The sanitizer is switched off, so the text reaches the command line unmodified. -->
+    <conditional name="cutadapt">
+      <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
+      <when value="false">
+        <!-- no extra parameters -->
+      </when>
+      <when value="true">
+        <param name="custom_options" type="text" area="True" size="8x30"  label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
+          <sanitizer sanitize="False"/>
+        </param>
+      </when>
+    </conditional>
+
+    <!-- Optional filter database, passed as -F only when the box is ticked. -->
+    <conditional name="similarity_filtering">
+      <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
+      <when value="false">
+        <!-- no extra parameters -->
+      </when>
+      <when value="true">
+
+        <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which has at least 90% similarity over 90% of length to sequence in filter database will be removed. This is suitable option if you want to remove organele DNA or contamination"/>
+      </when>
+    </conditional>
+
+    <!-- Expands to the -R flag on the command line when ticked, otherwise to an empty string. -->
+    <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="False" label="Rename sequences" help="By default, original sequence ID are used, in case your sequences do not follow proper naming scheme to label paired-end read mate, use this option. All read pairs must be complete!"/>
+  </inputs>
+
+
+  <!-- Result datasets: "paired" is handed to the wrapper via -o, "png_output" via -G. -->
+  <outputs>
+    <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
+    <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
+  </outputs>
+
+
+  <!-- Functional test: runs the tool on one pair of gzipped fastq files with the default filter values. -->
+  <tests>
+    <test>
+      <param name="A" value="ERR215189_1_part.fastq.gz" />
+      <param name="B" value="ERR215189_2_part.fastq.gz" />
+      <param name="max_n" value="0"/>
+      <param name="cut_off" value="10" />
+      <param name="percent_above" value="95" />
+      <!-- The output name has to match a data element declared in the outputs section ("paired", not "output"). -->
+      <output name="paired" value="paired_output.fasta" />
+      <!-- NOTE(review): a byte-exact PNG comparison may be brittle across plotting library versions; consider compare="sim_size". -->
+      <output name="png_output" value="paired_output.png" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool is designed for memory-efficient preprocessing of two
+fastq files. Output of this tool can be used as input of RepeatExplorer clustering.
+Input files can be gzip-compressed (.gz extension).
+Reads are filtered based on quality, presence of N bases and
+adapters. The two input fastq files are processed in parallel. Only complete pairs
+are kept. As the input files are processed in chunks, it is required that
+paired reads are complete and in the same order in both input files. All
+reads which pass the quality filter will be written into output files.
+If sampling is specified, only a sample of sequences will be
+returned. Cutadapt is run with these options::
+
+    --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
+    --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
+    --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
+    --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
+    --anywhere='CAAGCAGAAGACGGCATACGAGAT'
+    --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
+    --error-rate=0.05
+    --times=1 --overlap=15 --discard
+
+
+**Order of fastq files processing**
+
+1. Trimming (optional)       
+#. Filter by quality      
+#. Discard single reads, keep complete pairs
+#. Cutadapt filtering 
+#. Discard single reads, keep complete pairs    
+#. Sampling (optional)         
+#. Interlacing two fasta files
+
+**Quality setting cut-off**
+
+To correctly set the quality cut-off, you need to know how the quality is encoded in your fastq file. The default
+filtering, which is suitable for Sanger and Illumina 1.8 encoding, is shown below::
+
+
+      Default filtering cut-off
+                |   
+                |
+                V
+      SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+      ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+      ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+      .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+      LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+      !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+      |                         |    |        |                              |                     |
+     33                        59   64       73                            104                   126
+      0........................26...31.......40                                
+                               -5....0........9.............................40 
+                                     0........9.............................40 
+                                        3.....9.............................40 
+      0.2......................26...31........41                              
+    
+     S - Sanger        Phred+33,  raw reads typically (0, 40)
+     X - Solexa        Solexa+64, raw reads typically (-5, 40)
+     I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
+     J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+         with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) 
+         (Note: See discussion above).
+     L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+    
+  </help>    
+</tool>
+
author	petr-novak
date	Mon, 01 Apr 2019 07:56:36 -0400
parents
children	378565f5a875