annotate single_fastq_filtering.xml @ 9:c2c69c6090f0 draft

Uploaded
author petr-novak
date Fri, 31 Jan 2020 06:55:23 -0500
parents 378565f5a875
children 768883847008
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
1 <tool id="single_fastq_filtering" name="Preprocessing of FASTQ reads">
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
2 <stdio>
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
3 <exit_code range="1:" level="fatal" description="Error" />
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
4 </stdio>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 <description>
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
6 Preprocessing of FASTQ read files
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
7 including trimming, quality filtering, cutadapt filtering and sampling
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
8 </description>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
9 <requirements>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
10 <requirement type="package">blast</requirement>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11 <requirement type="package">cutadapt</requirement>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12 <requirement type="package">bioconductor-shortread</requirement>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13 <requirement type="package">r-optparse</requirement>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 </requirements>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15 <command interpreter="bash">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16 single_fastq_filtering_wrapper.sh -a ${A} -o ${output} -c ${cut_off} -p ${percent_above} -N ${max_n} -G ${png_output}
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18 #if $sampling.sequence_sampling :
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
19 -n $sampling.sample_size
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
20 #end if
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
22 #if $trimming.sequence_trimming :
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
23 -e $trimming.trim_end -s $trimming.trim_start
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
24 #end if
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
25
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 #if $cutadapt.use_custom :
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 -C "${cutadapt.custom_options}"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 #end if
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 #if $similarity_filtering.include :
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31 -F "${similarity_filtering.filter_database}"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32 #end if
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35 </command>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 <inputs>
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
38 <param format="fastq,fastq.gz" type="data" name="A" label="Reads in FASTQ format" />
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39 <conditional name="sampling">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
40 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Read sampling"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41 <when value="false">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 <!-- do nothing here -->
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44 <when value="true">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
45 <param name="sample_size" type="integer" label="Sample size (number of reads)" help="How many reads should be sampled" value="500000" min="0"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47 </conditional>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
49 <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="See below how to correctly set the quality cut-off" />
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
50 <param type="integer" name="percent_above" label="Percent above cutoff" value="95" min="0"
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
51 help="Percentage of bases in the read that must have quality equal to or higher than the cut-off value" />
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 <conditional name="trimming">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
54 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim reads"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 <when value="false">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56 <!-- do nothing here -->
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58 <when value="true">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
59 <param type="integer" name="trim_start" label="Start position" value="1" min="1"
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
60 help="Reads are trimmed at the specified start" />
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
61 <param type="integer" name="trim_end" label="End position" value="100" min="1"
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
62 help="Reads are trimmed to the specified end position, shorter sequences are discarded" />
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65 </conditional>
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
66 <param name="max_n" type="integer" label="maximum Ns" help="Maximal number of Ns allowed in reads" value="0" min="0" max="10"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 <conditional name="cutadapt">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
69 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Custom cutadapt options"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70 <when value="false">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71 <!-- do nothing here -->
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73 <when value="true">
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
74 <param name="custom_options" type="text" area="True" size="8x30" label="Custom options" help="Consult cutadapt for usage" value="">
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 <sanitizer sanitize="False"/>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76 </param>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78 </conditional>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80 <conditional name="similarity_filtering">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82 <when value="false">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83 <!-- do nothing here -->
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85 <when value="true">
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
87 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in FASTA format. Reads that have at least 90% similarity over 90% of their length to sequence in the filter database will be removed. This option is suitable for removing organellar or other contaminating sequences."/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88 </when>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
89 </conditional>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
90
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
91 </inputs>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
92
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
93
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
94 <outputs>
9
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
95 <data format="fasta" name="output" label="Filtered FASTA reads from datasets ${A.hid}"/>
c2c69c6090f0 Uploaded
petr-novak
parents: 5
diff changeset
96 <data format="png" name="png_output" label="Nucleotide composition after filtering of ${A.hid}"/>
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
97 </outputs>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
98
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
99 <tests>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
100 <test>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
101 <param name="A" value="ERR215189_1_part.fastq.gz" />
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
102 <param name="max_n" value="0"/>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
103 <param name="cut_off" value="10" />
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
104 <param name="percent_above" value="95" />
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
105 <output name="output" value="single_output.fasta" />
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
106 <output name="png_output" value="single_output.png" />
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
107 </test>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
108 </tests>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
109
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
110 <help>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
111 **What it does**
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
112
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
113 This tool is designed to perform preprocessing of a fastq file. Input files can be
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
114 gzip-compressed (.gz extension). Reads are filtered based on the quality,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
115 presence of N bases and adapters. All reads which pass the quality filter will
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
116 be written into output files. If sampling is specified, only a sample of sequences
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
117 will be returned.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
118
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
119 Cutadapt is run with these options::
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
120
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
121 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
122 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
123 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
124 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
125 --anywhere='CAAGCAGAAGACGGCATACGAGAT'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
126 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
127 --error-rate=0.05
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
128 --times=1 --overlap=15 --discard
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
129
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
130
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
131 **Order of fastq files processing**
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
132
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
133 1. Trimming (optional)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
134 #. Filter by quality
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
135 #. Cutadapt filtering
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
136 #. Sampling (optional)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
137 #. Interlacing two fasta files
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
138
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
139 **Quality setting cut-off**
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
140
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
141 To correctly set the quality cut-off, you need to know how the quality is encoded in your fastq file. The default
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
142 filtering which is suitable for Sanger and Illumina 1.8 encoding is shown below::
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
143
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
144
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
145 Default filtering cut-off
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
146 |
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
147 |
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
148 V
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
149 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
150 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
151 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
152 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
153 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
154 !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
155 | | | | | |
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
156 33 59 64 73 104 126
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
157 0........................26...31.......40
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
158 -5....0........9.............................40
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
159 0........9.............................40
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
160 3.....9.............................40
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
161 0.2......................26...31........41
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
162
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
163 S - Sanger Phred+33, raw reads typically (0, 40)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
164 X - Solexa Solexa+64, raw reads typically (-5, 40)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
165 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
166 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
167 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
168 (Note: See discussion above).
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
169 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
170
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
171 </help>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
172 </tool>
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
173