<?xml version="1.0" encoding="UTF-8"?>
<!--
  Galaxy tool wrapper "single_fastq_filtering".

  Takes one fastq / fastq.gz dataset and hands it, together with the chosen
  filtering parameters, to single_fastq_filtering_wrapper.sh. Two datasets are
  declared as outputs: the filtered reads (fasta) and a nucleotide composition
  plot (png).
-->
<tool id="single_fastq_filtering" name="Preprocessing of fastq reads">
  <description>Preprocessing of fastq files including trimming, quality filtering, cutadapt filtering and sampling</description>

  <requirements>
    <requirement type="package">blast</requirement>
    <requirement type="package">cutadapt</requirement>
    <requirement type="package">bioconductor-shortread</requirement>
    <requirement type="package">r-optparse</requirement>
  </requirements>

  <!--
    Command template (Cheetah). The mandatory options are always emitted; each
    optional flag is appended only when the matching conditional is switched on.
    Dataset paths are single-quoted so that unusual characters in a path cannot
    split the argument.
    NOTE(review): the interpreter attribute is deprecated in the current Galaxy
    tool schema; it is kept here so the way the script is invoked stays unchanged.
  -->
  <command interpreter="bash"><![CDATA[
    single_fastq_filtering_wrapper.sh -a '${A}' -o '${output}' -c ${cut_off} -p ${percent_above} -N ${max_n} -G '${png_output}'

    #if $sampling.sequence_sampling :
    -n $sampling.sample_size
    #end if

    #if $trimming.sequence_trimming :
    -e $trimming.trim_end -s $trimming.trim_start
    #end if

    #if $cutadapt.use_custom :
    -C "${cutadapt.custom_options}"
    #end if

    #if $similarity_filtering.include :
    -F "${similarity_filtering.filter_database}"
    #end if
  ]]></command>

  <inputs>
    <param format="fastq,fastq.gz" type="data" name="A" label="reads in fastq format"/>

    <!-- Optional sampling: adds -n with the requested sample size. -->
    <conditional name="sampling">
      <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Sequence sampling"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param name="sample_size" type="integer" label="Sample size (number of reads)" help="How many sequence reads should be in resulting dataset" value="500000" min="0"/>
      </when>
    </conditional>

    <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off"/>
    <!-- A percentage, so the accepted range is bounded to 0..100. -->
    <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100"
           help="Percent of bases in sequence that must have quality equal to / higher than cut-off value"/>

    <!-- Optional trimming: adds -s (start) and -e (end). -->
    <conditional name="trimming">
      <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Trim sequences"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
               help="sequences are trimmed at specified start"/>
        <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
               help="sequences are trimmed to specified end position, shorter sequences are discarded"/>
      </when>
    </conditional>

    <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/>

    <!-- Optional user-supplied cutadapt options: passed through with -C. -->
    <conditional name="cutadapt">
      <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Do you want to use custom cutadapt options"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <!--
          NOTE(review): sanitizing is switched off and the value is placed inside
          double quotes in the command template, so a value containing a double
          quote, a backtick or a dollar-parenthesis sequence reaches the shell
          unescaped. Consider a restrictive sanitizer with an explicit list of
          valid characters. Left unchanged here because the characters that
          legitimate cutadapt options need are not visible from this file.
        -->
        <param name="custom_options" type="text" area="true" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
          <sanitizer sanitize="false"/>
        </param>
      </when>
    </conditional>

    <!-- Optional similarity filtering: adds -F with the fasta filter database. -->
    <conditional name="similarity_filtering">
      <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Use similarity search filtering"/>
      <when value="false">
        <!-- do nothing here -->
      </when>
      <when value="true">
        <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of length to sequence in filter database will be removed. This is suitable option if you want to remove organelle DNA or contamination"/>
      </when>
    </conditional>
  </inputs>

  <outputs>
    <data format="fasta" name="output" label="filtered fasta reads from datasets ${A.hid}"/>
    <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid}"/>
  </outputs>

  <tests>
    <!-- Default settings: no sampling, trimming, custom cutadapt options or similarity filter. -->
    <test>
      <param name="A" value="ERR215189_1_part.fastq.gz"/>
      <param name="max_n" value="0"/>
      <param name="cut_off" value="10"/>
      <param name="percent_above" value="95"/>
      <output name="output" value="single_output.fasta"/>
      <output name="png_output" value="single_output.png"/>
    </test>
  </tests>

  <!--
    Help text is reStructuredText. It is wrapped in CDATA because the quality
    encoding diagram contains literal ampersand and less-than characters.
    NOTE(review): the processing order lists "Interlacing two fasta files",
    although this tool takes a single input dataset; confirm against the
    wrapper script whether that step applies here.
  -->
  <help><![CDATA[
**What it does**

This tool is designed to perform preprocessing of a fastq file. Input files can be
in GNU zipped archive (.gz extension). Reads are filtered based on the quality,
presence of N bases and adapters. All reads which pass the quality filter will
be written into output files. If sampling is specified, only a sample of sequences
will be returned.

Cutadapt is run with these options::

    --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
    --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
    --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
    --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
    --anywhere='CAAGCAGAAGACGGCATACGAGAT'
    --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
    --error-rate=0.05
    --times=1 --overlap=15 --discard


**Order of fastq files processing**

1. Trimming (optional)
#. Filter by quality
#. Cutadapt filtering
#. Sampling (optional)
#. Interlacing two fasta files

**Quality setting cut-off**

To correctly set quality cut-off, you need to know how the quality is encoded in your fastq file, default
filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::


    Default filtering cut-off
              |
              |
              V
    SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
    ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
    ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
    .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
    LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
    !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
    |                         |    |        |                              |                     |
   33                        59   64       73                            104                   126
    0........................26...31.......40
                             -5....0........9.............................40
                                   0........9.............................40
                                      3.....9.............................40
    0.2......................26...31........41

    S - Sanger        Phred+33,  raw reads typically (0, 40)
    X - Solexa        Solexa+64, raw reads typically (-5, 40)
    I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
    J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
        with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
        (Note: See discussion above).
    L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)

  ]]></help>
</tool>