Mercurial > repos > petr-novak > re_utils
comparison paired_fastq_filtering.xml @ 22:58807b35777a draft
planemo upload commit 20bdf879b52796d3fb251a20807191ff02084d3c-dirty
author | petr-novak |
---|---|
date | Wed, 02 Aug 2023 11:31:12 +0000 |
parents | 768883847008 |
children | 36c418bca8b2 |
comparison
equal
deleted
inserted
replaced
21:f4ed6a65a2ff | 22:58807b35777a |
---|---|
1 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads"> | 1 <tool id="paired_fastq_filtering" name="Preprocessing of FASTQ paired-end reads"> |
2 <stdio> | 2 <stdio> |
3 <exit_code range="1:" level="fatal" description="Error" /> | 3 <exit_code range="1:" level="fatal" description="Error" version="1.0.0.3"/> |
4 </stdio> | 4 </stdio> |
5 <description> | 5 <description> |
6 Preprocessing of paired-end reads in FASTQ format | 6 Preprocessing of paired-end reads in FASTQ format |
7 including trimming, quality filtering, cutadapt filtering and interlacing. Broken | 7 including trimming, quality filtering, cutadapt filtering and interlacing. Broken |
8 pairs are discarded. | 8 pairs are discarded. |
9 </description> | 9 </description> |
10 <requirements> | 10 <requirements> |
11 <requirement type="package">blast</requirement> | 11 <requirement type="package">blast</requirement> |
12 <requirement type="package">cutadapt</requirement> | 12 <requirement type="package">cutadapt</requirement> |
13 <requirement type="package">bioconductor-shortread</requirement> | 13 <requirement type="package">bioconductor-shortread</requirement> |
14 <requirement type="package">r-optparse</requirement> | 14 <requirement type="package">r-optparse</requirement> |
15 </requirements> | 15 </requirements> |
16 <command interpreter="bash"> | 16 <required_files> |
17 paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output} | 17 <include type="literal" path="paired_fastq_filtering_wrapper.sh"/> |
18 | 18 <include type="literal" path="paired_fastq_filtering.R"/> |
19 #if $sampling.sequence_sampling : | 19 <include type="literal" path="fasta_interlacer.py"/> |
20 -n $sampling.sample_size | 20 </required_files> |
21 #end if | 21 <command> |
22 | 22 bash '$__tool_directory__'/paired_fastq_filtering_wrapper.sh -a ${A} -b ${B} -o |
23 #if $trimming.sequence_trimming : | 23 ${paired} -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G ${png_output} |
24 -e $trimming.trim_end -s $trimming.trim_start | 24 |
25 #end if | 25 #if $sampling.sequence_sampling : |
26 | 26 -n $sampling.sample_size |
27 #if $cutadapt.use_custom : | 27 #end if |
28 -C "${cutadapt.custom_options}" | 28 |
29 #end if | 29 #if $trimming.sequence_trimming : |
30 | 30 -e $trimming.trim_end -s $trimming.trim_start |
31 #if $similarity_filtering.include : | 31 #end if |
32 -F "${similarity_filtering.filter_database}" | 32 |
33 #end if | 33 #if $cutadapt.use_custom : |
34 | 34 -C "${cutadapt.custom_options}" |
35 </command> | 35 #end if |
36 | 36 |
37 <inputs> | 37 #if $similarity_filtering.include : |
38 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" /> | 38 -F "${similarity_filtering.filter_database}" |
39 | 39 #end if |
40 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" /> | 40 |
41 | 41 </command> |
42 <conditional name="sampling"> | 42 |
43 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/> | 43 <inputs> |
44 <when value="false"> | 44 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads"/> |
45 <!-- do nothing here --> | 45 |
46 </when> | 46 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads"/> |
47 <when value="true"> | 47 |
48 <param name="sample_size" type="integer" label="Sample size (number of pairs)" help="How many read pairs should be sampled" value="500000" min="0"/> | 48 <conditional name="sampling"> |
49 </when> | 49 <param name="sequence_sampling" type="boolean" truevalue="true" |
50 </conditional> | 50 falsevalue="false" checked="False" label="Read sampling"/> |
51 | 51 <when value="false"> |
52 <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" help="See below how to correctly set the quality cutoff" /> | 52 <!-- do nothing here --> |
53 <param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0" | 53 </when> |
54 help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value" /> | 54 <when value="true"> |
55 | 55 <param name="sample_size" type="integer" |
56 <conditional name="trimming"> | 56 label="Sample size (number of pairs)" |
57 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/> | 57 help="How many read pairs should be sampled" value="500000" |
58 <when value="false"> | 58 min="0"/> |
59 <!-- do nothing here --> | 59 </when> |
60 </when> | 60 </conditional> |
61 <when value="true"> | 61 |
62 <param type="integer" name="trim_start" label="Start position" value="1" min="1" | 62 <param type="integer" name="cut_off" label="Quality cutoff" value="10" min="0" |
63 help="Reads are trimmed at the specified start" /> | 63 help="See below how to correctly set the quality cutoff"/> |
64 <param type="integer" name="trim_end" label="End position" value="100" min="1" | 64 <param type="integer" name="percent_above" label="Percent above cutoff" value="95" |
65 help="Reads are trimmed to the specified end position, shorted sequences are discarded" /> | 65 min="0" |
66 </when> | 66 help="Percentage of bases in the read that must have quality equal to or higher than the cutoff value"/> |
67 | 67 |
68 </conditional> | 68 <conditional name="trimming"> |
69 <param name="max_n" type="integer" label="Maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/> | 69 <param name="sequence_trimming" type="boolean" truevalue="true" |
70 | 70 falsevalue="false" checked="False" label="Trim reads"/> |
71 <conditional name="cutadapt"> | 71 <when value="false"> |
72 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/> | 72 <!-- do nothing here --> |
73 <when value="false"> | 73 </when> |
74 <!-- do nothing here --> | 74 <when value="true"> |
75 </when> | 75 <param type="integer" name="trim_start" label="Start position" value="1" |
76 <when value="true"> | 76 min="1" |
77 <param name="custom_options" type="text" area="True" size="8x30" label="Custom options" help="Consult cutadapt for usage" value=""> | 77 help="Reads are trimmed at the specified start"/> |
78 <sanitizer sanitize="False"/> | 78 <param type="integer" name="trim_end" label="End position" value="100" |
79 </param>> | 79 min="1" |
80 </when> | 80 help="Reads are trimmed to the specified end position, shorted sequences are discarded"/> |
81 </conditional> | 81 </when> |
82 | 82 |
83 <conditional name="similarity_filtering"> | 83 </conditional> |
84 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/> | 84 <param name="max_n" type="integer" label="Maximum Ns" |
85 <when value="false"> | 85 help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/> |
86 <!-- do nothing here --> | 86 |
87 </when> | 87 <conditional name="cutadapt"> |
88 <when value="true"> | 88 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" |
89 | 89 checked="False" label="Custom cutadapt options"/> |
90 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/> | 90 <when value="false"> |
91 </when> | 91 <!-- do nothing here --> |
92 </conditional> | 92 </when> |
93 | 93 <when value="true"> |
94 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" label="Rename reads" help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/> | 94 <param name="custom_options" type="text" area="True" size="8x30" |
95 </inputs> | 95 label="Custom options" help="Consult cutadapt for usage" value=""> |
96 | 96 <sanitizer sanitize="False"/> |
97 | 97 </param> |
98 <outputs> | 98 > |
99 <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/> | 99 </when> |
100 <data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>" | 100 </conditional> |
101 </outputs> | 101 |
102 | 102 <conditional name="similarity_filtering"> |
103 | 103 <param name="include" type="boolean" truevalue="true" falsevalue="false" |
104 <tests> | 104 checked="False" label="Use similarity search filtering"/> |
105 <test> | 105 <when value="false"> |
106 <param name="A" value="ERR215189_1_part.fastq.gz" /> | 106 <!-- do nothing here --> |
107 <param name="B" value="ERR215189_2_part.fastq.gz" /> | 107 </when> |
108 <param name="max_n" value="0"/> | 108 <when value="true"> |
109 <param name="cut_off" value="10" /> | 109 |
110 <param name="percent_above" value="95" /> | 110 <param name="filter_database" format="fasta" type="data" |
111 <output name="output" value="paired_output.fasta" /> | 111 label="Sequence filter database" |
112 <output name="png_output" value="paired_output.png" /> | 112 help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/> |
113 </test> | 113 </when> |
114 </tests> | 114 </conditional> |
115 | 115 |
116 <help> | 116 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="True" |
117 **What it does** | 117 label="Rename reads" |
118 | 118 help="By default, original read names are used. In case your reads do not follow proper naming scheme to label paired-end mates, use this option. All read pairs must be complete!"/> |
119 This tool is designed to make memory efficient preprocessing of two | 119 </inputs> |
120 fastq files. Output of this tool can be used as input of RepeatExplorer clustering. | 120 |
121 Input files can be in GNU zipped archive (.gz extension). | 121 |
122 Reads are filtered based on the quality, presence of N bases and | 122 <outputs> |
123 adapters. Two input fastq files are processed in parallel. Only complete pairs | 123 <data format="fasta" name="paired" |
124 are kept. As the input files are processed in chunks, it is required that | 124 label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/> |
125 paired reads are complete and in the same order in both input files. All | 125 <data format="png" name="png_output" |
126 reads which pass the quality filter will be written into output files. | 126 label="Nucleotide composition after filtering of ${A.hid} and ${B.hid} "/>" |
127 If sampling is specified, only a sample of sequences will be | 127 </outputs> |
128 returned. Cutadapt is run with these options:: | 128 |
129 | 129 |
130 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' | 130 <tests> |
131 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' | 131 <test> |
132 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' | 132 <param name="A" value="ERR215189_1_part.fastq.gz"/> |
133 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' | 133 <param name="B" value="ERR215189_2_part.fastq.gz"/> |
134 --anywhere='CAAGCAGAAGACGGCATACGAGAT' | 134 <param name="max_n" value="0"/> |
135 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' | 135 <param name="cut_off" value="10"/> |
136 --error-rate=0.05 | 136 <param name="percent_above" value="95"/> |
137 --times=1 --overlap=15 --discard | 137 <output name="output" value="paired_output.fasta"/> |
138 | 138 <output name="png_output" value="paired_output.png"/> |
139 | 139 </test> |
140 **Order of fastq files processing** | 140 </tests> |
141 | 141 |
142 1. Trimming (optional) | 142 <help> |
143 #. Filter by quality | 143 **What it does** |
144 #. Discard single reads, keep complete pairs | 144 |
145 #. Cutadapt filtering | 145 This tool is designed to make memory efficient preprocessing of two |
146 #. Discard single reads, keep complete pairs | 146 fastq files. Output of this tool can be used as input of RepeatExplorer |
147 #. Sampling (optional) | 147 clustering. |
148 #. Interlacing two fasta files | 148 Input files can be in GNU zipped archive (.gz extension). |
149 | 149 Reads are filtered based on the quality, presence of N bases and |
150 **Quality setting cutoff** | 150 adapters. Two input fastq files are processed in parallel. Only complete pairs |
151 | 151 are kept. As the input files are processed in chunks, it is required that |
152 To correctly set quality cutoff, you need to know how the quality is encoded in your fastq file, default | 152 paired reads are complete and in the same order in both input files. All |
153 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below:: | 153 reads which pass the quality filter will be written into output files. |
154 | 154 If sampling is specified, only a sample of sequences will be |
155 | 155 returned. Cutadapt is run with these options:: |
156 Default filtering cutoff | 156 |
157 | | 157 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' |
158 | | 158 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' |
159 V | 159 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' |
160 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... | 160 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' |
161 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... | 161 --anywhere='CAAGCAGAAGACGGCATACGAGAT' |
162 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... | 162 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' |
163 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... | 163 --error-rate=0.05 |
164 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... | 164 --times=1 --overlap=15 --discard |
165 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ | 165 |
166 | | | | | | | 166 |
167 33 59 64 73 104 126 | 167 **Order of fastq files processing** |
168 0........................26...31.......40 | 168 |
169 -5....0........9.............................40 | 169 1. Trimming (optional) |
170 0........9.............................40 | 170 #. Filter by quality |
171 3.....9.............................40 | 171 #. Discard single reads, keep complete pairs |
172 0.2......................26...31........41 | 172 #. Cutadapt filtering |
173 | 173 #. Discard single reads, keep complete pairs |
174 S - Sanger Phred+33, raw reads typically (0, 40) | 174 #. Sampling (optional) |
175 X - Solexa Solexa+64, raw reads typically (-5, 40) | 175 #. Interlacing two fasta files |
176 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) | 176 |
177 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) | 177 **Quality setting cutoff** |
178 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) | 178 |
179 (Note: See discussion above). | 179 To correctly set quality cutoff, you need to know how the quality is encoded in |
180 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) | 180 your fastq file, default |
181 | 181 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below:: |
182 </help> | 182 |
183 | |
184 Default filtering cutoff | |
185 | | |
186 | | |
187 V | |
188 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... | |
189 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... | |
190 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... | |
191 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... | |
192 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... | |
193 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ | |
194 | | | | | | | |
195 33 59 64 73 104 126 | |
196 0........................26...31.......40 | |
197 -5....0........9.............................40 | |
198 0........9.............................40 | |
199 3.....9.............................40 | |
200 0.2......................26...31........41 | |
201 | |
202 S - Sanger Phred+33, raw reads typically (0, 40) | |
203 X - Solexa Solexa+64, raw reads typically (-5, 40) | |
204 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) | |
205 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) | |
206 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) | |
207 (Note: See discussion above). | |
208 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) | |
209 | |
210 </help> | |
183 </tool> | 211 </tool> |
184 | 212 |