9
|
1 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads">
|
5
|
2 <stdio>
|
|
3 <exit_code range="1:" level="fatal" description="Error" />
|
|
4 </stdio>
|
0
|
5 <description>
|
9
|
6 Preprocessing of paired-end reads in FASTQ format
|
0
|
7 including trimming, quality filtering, cutadapt filtering and interlacing. Broken
|
|
8 pairs are discarded.
|
|
9 </description>
|
|
10 <requirements>
|
|
11 <requirement type="package">blast</requirement>
|
|
12 <requirement type="package">cutadapt</requirement>
|
|
13 <requirement type="package">bioconductor-shortread</requirement>
|
|
14 <requirement type="package">r-optparse</requirement>
|
|
15 </requirements>
|
|
16 <command interpreter="bash">
|
|
17 paired_fastq_filtering_wrapper.sh -a '${A}' -b '${B}' -o '${paired}' -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G '${png_output}'
|
|
18
|
|
19 #if $sampling.sequence_sampling :
|
|
20 -n $sampling.sample_size
|
|
21 #end if
|
|
22
|
|
23 #if $trimming.sequence_trimming :
|
|
24 -e $trimming.trim_end -s $trimming.trim_start
|
|
25 #end if
|
|
26
|
|
27 #if $cutadapt.use_custom :
|
|
28 -C "${cutadapt.custom_options}"
|
|
29 #end if
|
|
30
|
|
31 #if $similarity_filtering.include :
|
|
32 -F "${similarity_filtering.filter_database}"
|
|
33 #end if
|
|
34
|
|
35 </command>
|
|
36
|
|
37 <inputs>
|
|
38 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" />
|
|
39
|
|
40 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" />
|
|
41
|
|
42 <conditional name="sampling">
|
9
|
43 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/>
|
0
|
44 <when value="false">
|
|
45 <!-- do nothing here -->
|
|
46 </when>
|
|
47 <when value="true">
|
9
|
48 <param name="sample_size" type="integer" label="Sample size (number of pairs)" help="How many read pairs should be sampled" value="500000" min="0"/>
|
0
|
49 </when>
|
|
50 </conditional>
|
|
51
|
10
|
52 <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" help="See below how to correctly set the quality cutoff" />
|
9
|
53 <param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0" max="100"
|
10
|
54 help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value" />
|
0
|
55
|
|
56 <conditional name="trimming">
|
9
|
57 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/>
|
0
|
58 <when value="false">
|
|
59 <!-- do nothing here -->
|
|
60 </when>
|
|
61 <when value="true">
|
9
|
62 <param type="integer" name="trim_start" label="Start position" value="1" min="1"
|
|
63 help="Reads are trimmed at the specified start" />
|
|
64 <param type="integer" name="trim_end" label="End position" value="100" min="1"
|
|
65 help="Reads are trimmed to the specified end position, shorter sequences are discarded" />
|
0
|
66 </when>
|
|
67
|
|
68 </conditional>
|
9
|
69 <param name="max_n" type="integer" label="Maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
|
0
|
70
|
|
71 <conditional name="cutadapt">
|
9
|
72 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/>
|
0
|
73 <when value="false">
|
|
74 <!-- do nothing here -->
|
|
75 </when>
|
|
76 <when value="true">
|
9
|
77 <param name="custom_options" type="text" area="True" size="8x30" label="Custom options" help="Consult cutadapt for usage" value="">
|
0
|
78 <sanitizer sanitize="False"/>
|
|
79 </param>
|
|
80 </when>
|
|
81 </conditional>
|
|
82
|
|
83 <conditional name="similarity_filtering">
|
|
84 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
|
|
85 <when value="false">
|
|
86 <!-- do nothing here -->
|
|
87 </when>
|
|
88 <when value="true">
|
|
89
|
9
|
90 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
|
0
|
91 </when>
|
|
92 </conditional>
|
|
93
|
9
|
94 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" label="Rename reads" help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
|
0
|
95 </inputs>
|
|
96
|
|
97
|
|
98 <outputs>
|
|
99 <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
|
9
|
100 <data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>
|
0
|
101 </outputs>
|
|
102
|
|
103
|
|
104 <tests>
|
|
105 <test>
|
|
106 <param name="A" value="ERR215189_1_part.fastq.gz" />
|
|
107 <param name="B" value="ERR215189_2_part.fastq.gz" />
|
|
108 <param name="max_n" value="0"/>
|
|
109 <param name="cut_off" value="10" />
|
|
110 <param name="percent_above" value="95" />
|
|
111 <output name="paired" value="paired_output.fasta" />
|
|
112 <output name="png_output" value="paired_output.png" />
|
|
113 </test>
|
|
114 </tests>
|
|
115
|
|
116 <help>
|
|
117 **What it does**
|
|
118
|
|
119 This tool is designed to perform memory-efficient preprocessing of two
|
|
120 fastq files. Output of this tool can be used as input for RepeatExplorer clustering.
|
|
121 Input files can be gzip-compressed (.gz extension).
|
|
122 Reads are filtered based on the quality, presence of N bases and
|
|
123 adapters. Two input fastq files are processed in parallel. Only complete pairs
|
|
124 are kept. As the input files are processed in chunks, it is required that
|
|
125 pair reads are complete and in the same order in both input files. All
|
|
126 reads which pass the quality filter will be written into output files.
|
|
127 If sampling is specified, only a sample of sequences will be
|
|
128 returned. Cutadapt is run with these options::
|
|
129
|
|
130 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
|
|
131 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
|
|
132 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
|
|
133 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
|
|
134 --anywhere='CAAGCAGAAGACGGCATACGAGAT'
|
|
135 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
|
|
136 --error-rate=0.05
|
|
137 --times=1 --overlap=15 --discard
|
|
138
|
|
139
|
|
140 **Order of fastq files processing**
|
|
141
|
|
142 1. Trimming (optional)
|
|
143 #. Filter by quality
|
|
144 #. Discard single reads, keep complete pairs
|
|
145 #. Cutadapt filtering
|
|
146 #. Discard single reads, keep complete pairs
|
|
147 #. Sampling (optional)
|
|
148 #. Interlacing two fasta files
|
|
149
|
10
|
150 **Quality setting cutoff**
|
0
|
151
|
10
|
152 To correctly set the quality cutoff, you need to know how the quality is encoded in your fastq file; the default
|
0
|
153 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
|
|
154
|
|
155
|
10
|
156 Default filtering cutoff
|
0
|
157 |
|
|
158 |
|
|
159 V
|
|
160 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
|
|
161 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
|
|
162 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
|
|
163 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
|
|
164 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
|
|
165 !"#$%&amp;'()*+,-./0123456789:;&lt;=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
|
|
166 | | | | | |
|
|
167 33 59 64 73 104 126
|
|
168 0........................26...31.......40
|
|
169 -5....0........9.............................40
|
|
170 0........9.............................40
|
|
171 3.....9.............................40
|
|
172 0.2......................26...31........41
|
|
173
|
|
174 S - Sanger Phred+33, raw reads typically (0, 40)
|
|
175 X - Solexa Solexa+64, raw reads typically (-5, 40)
|
|
176 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
|
|
177 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
|
|
178 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
|
|
179 (Note: See discussion above).
|
|
180 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
|
|
181
|
|
182 </help>
|
|
183 </tool>
|
|
184
|