re_utils: single_fastq_filtering.xml comparison

comparison single_fastq_filtering.xml @ 0:a4cd8608ef6b draft

Uploaded

author	petr-novak
date	Mon, 01 Apr 2019 07:56:36 -0400
parents
children	378565f5a875

comparison

equal deleted inserted replaced

--1:000000000000
+:a4cd8608ef6b
+<tool id="single_fastq_filtering" name="Preprocessing of fastq reads">
+<description>
+Preprocessing of fastq files
+including trimming, quality filtering, cutadapt filtering and sampling
+</description>
+<!-- External packages this tool declares as dependencies (all of type "package"). -->
+<requirements>
+  <requirement type="package">blast</requirement>
+  <requirement type="package">cutadapt</requirement>
+  <requirement type="package">bioconductor-shortread</requirement>
+  <requirement type="package">r-optparse</requirement>
+</requirements>
+<!-- Wrapper invocation: the fixed options come first, then one optional group
+     of options for every conditional that the user switched on.
+     The script name must stay the first token of the command text. -->
+<command interpreter="bash"><![CDATA[
+single_fastq_filtering_wrapper.sh
+  -a ${A}
+  -o ${output}
+  -c ${cut_off}
+  -p ${percent_above}
+  -N ${max_n}
+  -G ${png_output}
+#if $sampling.sequence_sampling:
+  -n $sampling.sample_size
+#end if
+#if $trimming.sequence_trimming:
+  -e $trimming.trim_end -s $trimming.trim_start
+#end if
+#if $cutadapt.use_custom:
+  -C "${cutadapt.custom_options}"
+#end if
+#if $similarity_filtering.include:
+  -F "${similarity_filtering.filter_database}"
+#end if
+]]></command>
+<inputs>
+  <!-- Reads to be filtered; handed to the wrapper as -a. -->
+  <param format="fastq,fastq.gz" type="data" name="A" label="reads in fastq format" />
+  <!-- Optional sampling: sample_size is passed as -n only when switched on. -->
+  <conditional name="sampling">
+    <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/>
+    <when value="false">
+      <!-- no additional parameters -->
+    </when>
+    <when value="true">
+      <param name="sample_size" type="integer" label="Sample size (number of reads)" help="How many sequence reads should be in resulting dataset" value="500000" min="0"/>
+    </when>
+  </conditional>
+  <!-- Quality filter settings; handed to the wrapper as -c and -p. -->
+  <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" />
+  <!-- A percentage, so the value is bounded to 0..100. -->
+  <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100"
+         help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" />
+  <!-- Optional trimming: trim_end / trim_start are passed as -e / -s only when switched on. -->
+  <conditional name="trimming">
+    <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
+    <when value="false">
+      <!-- no additional parameters -->
+    </when>
+    <when value="true">
+      <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
+             help="sequences are trimmed at specified start" />
+      <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
+             help="sequences are trimmed to specified end position, shorter sequences are discarded" />
+    </when>
+  </conditional>
+  <!-- Upper limit on N bases per read; handed to the wrapper as -N. -->
+  <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>
+  <!-- Optional user-supplied cutadapt options; passed as -C only when switched on. -->
+  <conditional name="cutadapt">
+    <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
+    <when value="false">
+      <!-- no additional parameters -->
+    </when>
+    <when value="true">
+      <!-- NOTE(review): sanitizing is switched off and the command section places this
+           value inside double quotes on the command line, so the text reaches the
+           shell unescaped. Confirm this is acceptable for the deployment. -->
+      <param name="custom_options" type="text" area="True" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
+        <sanitizer sanitize="False"/>
+      </param>
+    </when>
+  </conditional>
+  <!-- Optional similarity filter: the fasta database is passed as -F only when switched on. -->
+  <conditional name="similarity_filtering">
+    <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
+    <when value="false">
+      <!-- no additional parameters -->
+    </when>
+    <when value="true">
+      <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of length to sequence in filter database will be removed. This is suitable option if you want to remove organelle DNA or contamination"/>
+    </when>
+  </conditional>
+</inputs>
+<outputs>
+  <!-- Filtered reads in fasta format; the command section passes this path as -o. -->
+  <data format="fasta" name="output" label="filtered fasta reads from datasets ${A.hid}"/>
+  <!-- PNG plot; the command section passes this path as -G. -->
+  <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid}"/>
+</outputs>
+<tests>
+  <!-- Default run: no sampling, trimming, custom cutadapt options or similarity filter. -->
+  <test>
+    <param name="A" value="ERR215189_1_part.fastq.gz" />
+    <param name="max_n" value="0"/>
+    <param name="cut_off" value="10" />
+    <param name="percent_above" value="95" />
+    <output name="output" value="single_output.fasta" />
+    <!-- The PNG is a rendered binary image, so it is compared by size rather than
+         by a line diff, which is fragile for binary output. -->
+    <output name="png_output" value="single_output.png" compare="sim_size" />
+  </test>
+</tests>
+<help>
+**What it does**
+This tool is designed to perform preprocessing of a fastq file. Input files can be
+in GNU zipped archive (.gz extension). Reads are filtered based on the quality,
+presence of N bases and adapters. All reads which pass the quality filter will
+be written into output files. If sampling is specified, only a sample of sequences
+will be returned.
+Cutadapt is run with these options::
+--anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
+--anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
+--anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
+--anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
+--anywhere='CAAGCAGAAGACGGCATACGAGAT'
+--anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
+--error-rate=0.05
+--times=1 --overlap=15 --discard
+**Order of fastq files processing**
+1. Trimming (optional)
+#. Filter by quality
+#. Cutadapt filtering
+#. Sampling (optional)
+#. Interlacing two fasta files
+**Quality setting cut-off**
+To correctly set quality cut-off, you need to know how the quality is encoded in your fastq file, default
+filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
+Default filtering cut-off
+|
+|
+V
+SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+.................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+!"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+|                         |    |        |                              |                     |
+33                        59   64       73                            104                   126
+0........................26...31.......40
+-5....0........9.............................40
+0........9.............................40
+3.....9.............................40
+0.2......................26...31........41
+S - Sanger        Phred+33,  raw reads typically (0, 40)
+X - Solexa        Solexa+64, raw reads typically (-5, 40)
+I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
+J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
+(Note: See discussion above).
+L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+</help>
+</tool>

Mercurial > repos > petr-novak > re_utils

comparison single_fastq_filtering.xml @ 0:a4cd8608ef6b draft