comparison paired_fastq_filtering.xml @ 22:58807b35777a draft

planemo upload commit 20bdf879b52796d3fb251a20807191ff02084d3c-dirty
author petr-novak
date Wed, 02 Aug 2023 11:31:12 +0000
parents 768883847008
children 36c418bca8b2
comparison
equal deleted inserted replaced
21:f4ed6a65a2ff 22:58807b35777a
1 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads"> 1 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads">
2 <stdio> 2 <stdio>
3 <exit_code range="1:" level="fatal" description="Error" /> 3 <exit_code range="1:" level="fatal" description="Error" version="1.0.0.3"/>
4 </stdio> 4 </stdio>
5 <description> 5 <description>
6 Preprocessing of paired-end reads in FASTQ format 6 Preprocessing of paired-end reads in FASTQ format
7 including trimming, quality filtering, cutadapt filtering and interlacing. Broken 7 including trimming, quality filtering, cutadapt filtering and interlacing. Broken
8 pairs are discarded. 8 pairs are discarded.
9 </description> 9 </description>
10 <requirements> 10 <requirements>
11 <requirement type="package">blast</requirement> 11 <requirement type="package">blast</requirement>
12 <requirement type="package">cutadapt</requirement> 12 <requirement type="package">cutadapt</requirement>
13 <requirement type="package">bioconductor-shortread</requirement> 13 <requirement type="package">bioconductor-shortread</requirement>
14 <requirement type="package">r-optparse</requirement> 14 <requirement type="package">r-optparse</requirement>
15 </requirements> 15 </requirements>
16 <command interpreter="bash"> 16 <required_files>
17 paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output} 17 <include type="literal" path="paired_fastq_filtering_wrapper.sh"/>
18 18 <include type="literal" path="paired_fastq_filtering.R"/>
19 #if $sampling.sequence_sampling : 19 <include type="literal" path="fasta_interlacer.py"/>
20 -n $sampling.sample_size 20 </required_files>
21 #end if 21 <command>
22 22 bash '$__tool_directory__'/paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o
23 #if $trimming.sequence_trimming : 23 ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output}
24 -e $trimming.trim_end -s $trimming.trim_start 24
25 #end if 25 #if $sampling.sequence_sampling :
26 26 -n $sampling.sample_size
27 #if $cutadapt.use_custom : 27 #end if
28 -C "${cutadapt.custom_options}" 28
29 #end if 29 #if $trimming.sequence_trimming :
30 30 -e $trimming.trim_end -s $trimming.trim_start
31 #if $similarity_filtering.include : 31 #end if
32 -F "${similarity_filtering.filter_database}" 32
33 #end if 33 #if $cutadapt.use_custom :
34 34 -C "${cutadapt.custom_options}"
35 </command> 35 #end if
36 36
37 <inputs> 37 #if $similarity_filtering.include :
38 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" /> 38 -F "${similarity_filtering.filter_database}"
39 39 #end if
40 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" /> 40
41 41 </command>
42 <conditional name="sampling"> 42
43 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/> 43 <inputs>
44 <when value="false"> 44 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads"/>
45 <!-- do nothing here --> 45
46 </when> 46 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads"/>
47 <when value="true"> 47
48 <param name="sample_size" type="integer" label="Sample size (number of pairs)" help="How many read pairs should be sampled" value="500000" min="0"/> 48 <conditional name="sampling">
49 </when> 49 <param name="sequence_sampling" type="boolean" truevalue="true"
50 </conditional> 50 falsevalue="false" checked="False" label="Read sampling"/>
51 51 <when value="false">
52 <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" help="See below how to correctly set the quality cutoff" /> 52 <!-- do nothing here -->
53 <param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0" 53 </when>
54 help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value" /> 54 <when value="true">
55 55 <param name="sample_size" type="integer"
56 <conditional name="trimming"> 56 label="Sample size (number of pairs)"
57 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/> 57 help="How many read pairs should be sampled" value="500000"
58 <when value="false"> 58 min="0"/>
59 <!-- do nothing here --> 59 </when>
60 </when> 60 </conditional>
61 <when value="true"> 61
62 <param type="integer" name="trim_start" label="Start position" value="1" min="1" 62 <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0"
63 help="Reads are trimmed at the specified start" /> 63 help="See below how to correctly set the quality cutoff"/>
64 <param type="integer" name="trim_end" label="End position" value="100" min="1" 64 <param type="integer" name="percent_above" label="Percent above cutoff" value="95"
65 help="Reads are trimmed to the specified end position, shorted sequences are discarded" /> 65 min="0"
66 </when> 66 help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value"/>
67 67
68 </conditional> 68 <conditional name="trimming">
69 <param name="max_n" type="integer" label="Maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/> 69 <param name="sequence_trimming" type="boolean" truevalue="true"
70 70 falsevalue="false" checked="False" label="Trim reads"/>
71 <conditional name="cutadapt"> 71 <when value="false">
72 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/> 72 <!-- do nothing here -->
73 <when value="false"> 73 </when>
74 <!-- do nothing here --> 74 <when value="true">
75 </when> 75 <param type="integer" name="trim_start" label="Start position" value="1"
76 <when value="true"> 76 min="1"
77 <param name="custom_options" type="text" area="True" size="8x30" label="Custom options" help="Consult cutadapt for usage" value=""> 77 help="Reads are trimmed at the specified start"/>
78 <sanitizer sanitize="False"/> 78 <param type="integer" name="trim_end" label="End position" value="100"
79 </param>> 79 min="1"
80 </when> 80 help="Reads are trimmed to the specified end position, shorted sequences are discarded"/>
81 </conditional> 81 </when>
82 82
83 <conditional name="similarity_filtering"> 83 </conditional>
84 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/> 84 <param name="max_n" type="integer" label="Maximum Ns"
85 <when value="false"> 85 help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
86 <!-- do nothing here --> 86
87 </when> 87 <conditional name="cutadapt">
88 <when value="true"> 88 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false"
89 89 checked="False" label="Custom cutadapt options"/>
90 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/> 90 <when value="false">
91 </when> 91 <!-- do nothing here -->
92 </conditional> 92 </when>
93 93 <when value="true">
94 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" label="Rename reads" help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/> 94 <param name="custom_options" type="text" area="True" size="8x30"
95 </inputs> 95 label="Custom options" help="Consult cutadapt for usage" value="">
96 96 <sanitizer sanitize="False"/>
97 97 </param>
98 <outputs> 98 >
99 <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/> 99 </when>
100 <data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>" 100 </conditional>
101 </outputs> 101
102 102 <conditional name="similarity_filtering">
103 103 <param name="include" type="boolean" truevalue="true" falsevalue="false"
104 <tests> 104 checked="False" label="Use similarity search filtering"/>
105 <test> 105 <when value="false">
106 <param name="A" value="ERR215189_1_part.fastq.gz" /> 106 <!-- do nothing here -->
107 <param name="B" value="ERR215189_2_part.fastq.gz" /> 107 </when>
108 <param name="max_n" value="0"/> 108 <when value="true">
109 <param name="cut_off" value="10" /> 109
110 <param name="percent_above" value="95" /> 110 <param name="filter_database" format="fasta" type="data"
111 <output name="output" value="paired_output.fasta" /> 111 label="Sequence filter database"
112 <output name="png_output" value="paired_output.png" /> 112 help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
113 </test> 113 </when>
114 </tests> 114 </conditional>
115 115
116 <help> 116 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True"
117 **What it does** 117 label="Rename reads"
118 118 help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/>
119 This tool is designed to make memory efficient preprocessing of two 119 </inputs>
120 fastq files. Output of this file can be used as input of RepeatExplorer clustering. 120
121 Input files can be in GNU zipped archive (.gz extension). 121
122 Reads are filtered based on the quality, presence of N bases and 122 <outputs>
123 adapters. Two input fastq files are processed in parallel. Only complete pairs 123 <data format="fasta" name="paired"
124 are kept. As the input files are processed in chunks, it is required that 124 label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/>
125 pair reads are complete and in the same order in both input files. All 125 <data format="png" name="png_output"
126 reads which pass the quality filter will be written into output files. 126 label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>"
127 If sampling is specified, only sample of sequences will be 127 </outputs>
128 returned. Cutadapt is run with these options:: 128
129 129
130 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' 130 <tests>
131 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' 131 <test>
132 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' 132 <param name="A" value="ERR215189_1_part.fastq.gz"/>
133 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' 133 <param name="B" value="ERR215189_2_part.fastq.gz"/>
134 --anywhere='CAAGCAGAAGACGGCATACGAGAT' 134 <param name="max_n" value="0"/>
135 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' 135 <param name="cut_off" value="10"/>
136 --error-rate=0.05 136 <param name="percent_above" value="95"/>
137 --times=1 --overlap=15 --discard 137 <output name="output" value="paired_output.fasta"/>
138 138 <output name="png_output" value="paired_output.png"/>
139 139 </test>
140 **Order of fastq files processing** 140 </tests>
141 141
142 1. Trimming (optional) 142 <help>
143 #. Filter by quality 143 **What it does**
144 #. Discard single reads, keep complete pairs 144
145 #. Cutadapt filtering 145 This tool is designed to make memory efficient preprocessing of two
146 #. Discard single reads, keep complete pairs 146 fastq files. Output of this file can be used as input of RepeatExplorer
147 #. Sampling (optional) 147 clustering.
148 #. Interlacing two fasta files 148 Input files can be in GNU zipped archive (.gz extension).
149 149 Reads are filtered based on the quality, presence of N bases and
150 **Quality setting cutoff** 150 adapters. Two input fastq files are processed in parallel. Only complete pairs
151 151 are kept. As the input files are processed in chunks, it is required that
152 To correctly set quality cutoff, you need to know how the quality is encoded in your fastq file, default 152 pair reads are complete and in the same order in both input files. All
153 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below:: 153 reads which pass the quality filter will be written into output files.
154 154 If sampling is specified, only sample of sequences will be
155 155 returned. Cutadapt is run with these options::
156 Default filtering cutoff 156
157 | 157 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
158 | 158 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
159 V 159 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
160 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... 160 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
161 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... 161 --anywhere='CAAGCAGAAGACGGCATACGAGAT'
162 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... 162 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
163 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... 163 --error-rate=0.05
164 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... 164 --times=1 --overlap=15 --discard
165 !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ 165
166 | | | | | | 166
167 33 59 64 73 104 126 167 **Order of fastq files processing**
168 0........................26...31.......40 168
169 -5....0........9.............................40 169 1. Trimming (optional)
170 0........9.............................40 170 #. Filter by quality
171 3.....9.............................40 171 #. Discard single reads, keep complete pairs
172 0.2......................26...31........41 172 #. Cutadapt filtering
173 173 #. Discard single reads, keep complete pairs
174 S - Sanger Phred+33, raw reads typically (0, 40) 174 #. Sampling (optional)
175 X - Solexa Solexa+64, raw reads typically (-5, 40) 175 #. Interlacing two fasta files
176 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) 176
177 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) 177 **Quality setting cutoff**
178 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) 178
179 (Note: See discussion above). 179 To correctly set quality cutoff, you need to know how the quality is encoded in
180 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) 180 your fastq file, default
181 181 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
182 </help> 182
183
184 Default filtering cutoff
185 |
186 |
187 V
188 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
189 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
190 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
191 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
192 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
193 !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
194 | | | | | |
195 33 59 64 73 104 126
196 0........................26...31.......40
197 -5....0........9.............................40
198 0........9.............................40
199 3.....9.............................40
200 0.2......................26...31........41
201
202 S - Sanger Phred+33, raw reads typically (0, 40)
203 X - Solexa Solexa+64, raw reads typically (-5, 40)
204 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
205 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
206 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
207 (Note: See discussion above).
208 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
209
210 </help>
183 </tool> 211 </tool>
184 212