comparison single_fastq_filtering.xml @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 <tool id="single_fastq_filtering" name="Preprocessing of fastq reads">
2 <description>
3 Preprocessing of fastq files
4 including trimming, quality filtering, cutadapt filtering and sampling
5 </description>
6 <requirements>
7 <requirement type="package">blast</requirement>
8 <requirement type="package">cutadapt</requirement>
9 <requirement type="package">bioconductor-shortread</requirement>
10 <requirement type="package">r-optparse</requirement>
11 </requirements>
12 <command interpreter="bash">
13 single_fastq_filtering_wrapper.sh -a '${A}' -o '${output}' -c ${cut_off} -p ${percent_above} -N ${max_n} -G '${png_output}'
14 ## Dataset paths above are single-quoted so paths with spaces or shell metacharacters stay one argument.
15 #if $sampling.sequence_sampling :
16 -n $sampling.sample_size
17 #end if
18 ## Trimming bounds are only passed when the "Trim sequences" switch is on.
19 #if $trimming.sequence_trimming :
20 -e $trimming.trim_end -s $trimming.trim_start
21 #end if
22 ## NOTE(review): custom_options has its sanitizer disabled, so quotes, backticks and dollar signs typed by the user reach the shell unescaped.
23 #if $cutadapt.use_custom :
24 -C "${cutadapt.custom_options}"
25 #end if
26 ## The filter database is an optional fasta dataset handed to the wrapper as -F.
27 #if $similarity_filtering.include :
28 -F "${similarity_filtering.filter_database}"
29 #end if
30
31
32 </command>
33
34 <inputs>
35 <param format="fastq,fastq.gz" type="data" name="A" label="reads in fastq format" />
36 <conditional name="sampling">
37 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/>
38 <when value="false">
39 <!-- sampling disabled: no extra parameters, -n is not passed to the wrapper -->
40 </when>
41 <when value="true"> <!-- sample_size is passed to the wrapper as -n -->
42 <param name="sample_size" type="integer" label="Sample size (number of reads)" help="How many sequence reads should be in the resulting dataset" value="500000" min="0"/>
43 </when>
44 </conditional>
45
46 <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" />
47 <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100"
48 help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" /> <!-- a percentage, hence bounded to 0..100; passed to the wrapper as -p -->
49
50 <conditional name="trimming">
51 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
52 <when value="false">
53 <!-- trimming disabled: neither -s nor -e is passed to the wrapper -->
54 </when>
55 <when value="true"> <!-- trim_start and trim_end are passed to the wrapper as -s and -e -->
56 <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
57 help="sequences are trimmed at specified start" />
58 <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
59 help="sequences are trimmed to specified end position, shorter sequences are discarded" />
60 </when>
61
62 </conditional>
63 <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>
64
65 <conditional name="cutadapt">
66 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
67 <when value="false">
68 <!-- no custom options: -C is not passed to the wrapper -->
69 </when>
70 <when value="true"> <!-- NOTE(review): the sanitizer below is disabled, so the raw text is placed on the command line after -C; confirm this is intended -->
71 <param name="custom_options" type="text" area="True" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
72 <sanitizer sanitize="False"/>
73 </param>
74 </when>
75 </conditional>
76
77 <conditional name="similarity_filtering">
78 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
79 <when value="false">
80 <!-- filtering disabled: -F is not passed to the wrapper -->
81 </when>
82 <when value="true"> <!-- filter_database is passed to the wrapper as -F -->
83
84 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of their length to a sequence in the filter database will be removed. This option is suitable if you want to remove organelle DNA or contamination"/>
85 </when>
86 </conditional>
87
88 </inputs>
89
90
91 <outputs> <!-- both datasets are written by the wrapper: output via -o, png_output via -G -->
92 <data format="fasta" name="output" label="filtered fasta reads from datasets ${A.hid}"/>
93 <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid}"/>
94 </outputs>
95
96 <tests>
97 <test> <!-- single run with sampling, trimming, custom cutadapt options and similarity filtering left unset -->
98 <param name="A" value="ERR215189_1_part.fastq.gz" />
99 <param name="max_n" value="0"/> <!-- the three filter values below equal the defaults declared in the inputs section -->
100 <param name="cut_off" value="10" />
101 <param name="percent_above" value="95" />
102 <output name="output" value="single_output.fasta" /> <!-- expected filtered reads -->
103 <output name="png_output" value="single_output.png" /> <!-- expected nucleotide composition plot -->
104 </test>
105 </tests>
106
107 <help>
108 **What it does**
109
110 This tool is designed to perform preprocessing of a fastq file. Input files can be
111 gzip-compressed (.gz extension). Reads are filtered based on the quality,
112 presence of N bases and adapters. All reads which pass the quality filter will
113 be written into output files. If sampling is specified, only a sample of sequences
114 will be returned.
115
116 Cutadapt is run with these options::
117
118 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
119 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
120 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
121 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
122 --anywhere='CAAGCAGAAGACGGCATACGAGAT'
123 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
124 --error-rate=0.05
125 --times=1 --overlap=15 --discard
126
127
128 **Order of fastq files processing**
129
130 1. Trimming (optional)
131 #. Filter by quality
132 #. Cutadapt filtering
133 #. Sampling (optional)
134 #. Interlacing two fasta files
135
136 **Quality setting cut-off**
137
138 To correctly set the quality cut-off, you need to know how the quality is encoded in your fastq file. The default
139 filtering, which is suitable for Sanger and Illumina 1.8 encoding, is shown below::
140
141
142 Default filtering cut-off
143 |
144 |
145 V
146 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
147 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
148 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
149 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
150 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
151 !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
152 | | | | | |
153 33 59 64 73 104 126
154 0........................26...31.......40
155 -5....0........9.............................40
156 0........9.............................40
157 3.....9.............................40
158 0.2......................26...31........41
159
160 S - Sanger Phred+33, raw reads typically (0, 40)
161 X - Solexa Solexa+64, raw reads typically (-5, 40)
162 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
163 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
164 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
165 (Note: See discussion above).
166 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
167
168 </help>
169 </tool>
170