comparison paired_fastq_filtering.xml @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 <tool id="paired_fastq_filtering" name="Preprocessing of fastq paired-reads">
2 <description>
3 Preprocessing of paired reads fastq files
4 including trimming, quality filtering, cutadapt filtering and interlacing. Broken
5 pairs are discarded.
6 </description>
7 <requirements> <!-- Packages this tool declares as dependencies. NOTE(review): no version attributes are set; consider pinning versions for reproducible installs. -->
8 <requirement type="package">blast</requirement> <!-- presumably backs the similarity-search filter (-F); verify against the wrapper script -->
9 <requirement type="package">cutadapt</requirement>
10 <requirement type="package">bioconductor-shortread</requirement>
11 <requirement type="package">r-optparse</requirement>
12 </requirements>
13 <command>
14 bash '$__tool_directory__/paired_fastq_filtering_wrapper.sh' -a '${A}' -b '${B}' -o '${paired}' -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G '${png_output}'
15 ## Dataset paths are single-quoted; $rename stays unquoted because it expands to either -R or nothing.
16 #if $sampling.sequence_sampling :
17 -n $sampling.sample_size
18 #end if
19
20 #if $trimming.sequence_trimming :
21 -e $trimming.trim_end -s $trimming.trim_start
22 #end if
23 ## NOTE(review): custom_options has its sanitizer disabled and is interpolated inside double quotes - shell-injection risk on shared servers.
24 #if $cutadapt.use_custom :
25 -C "${cutadapt.custom_options}"
26 #end if
27
28 #if $similarity_filtering.include :
29 -F "${similarity_filtering.filter_database}"
30 #end if
31
32 </command>
33
34 <inputs>
35 <param name="A" type="data" format="fastq,fastq.gz" label="Left-hand reads"/>
36 <!-- A and B are the two mate files of one paired-end run -->
37 <param name="B" type="data" format="fastq,fastq.gz" label="Right-hand reads"/>
38
39 <conditional name="sampling">
40 <param name="sequence_sampling" type="boolean" label="Sequence sampling" checked="False" truevalue="true" falsevalue="false"/>
41 <when value="false">
42 <!-- sampling switched off: no extra inputs -->
43 </when>
44 <when value="true">
45 <param name="sample_size" type="integer" value="500000" min="0" label="Sample size(number of pairs)" help="How many sequence pairs should be in resulting dataset"/>
46 </when>
47 </conditional>
48
49 <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" /> <!-- passed to the wrapper as -c -->
50 <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100"
51 help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" /> <!-- passed as -p; a percentage, hence bounded to 0-100 -->
52
53 <conditional name="trimming">
54 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/>
55 <when value="false">
56 <!-- trimming switched off: no extra inputs -->
57 </when>
58 <when value="true">
59 <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1"
60 help="sequences are trimmed at specified start" /> <!-- passed to the wrapper as -s -->
61 <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1"
62 help="sequences are trimmed to specified end position, shorter sequences are discarded" /> <!-- passed to the wrapper as -e -->
63 </when>
64
65 </conditional>
66 <param name="max_n" type="integer" value="0" min="0" max="10" label="maximum Ns" help="Maximum number of Ns in sequence"/> <!-- passed to the wrapper as -N -->
67
68 <conditional name="cutadapt">
69 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/>
70 <when value="false">
71 <!-- no custom options: no extra inputs -->
72 </when>
73 <when value="true">
74 <param name="custom_options" type="text" area="True" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value="">
75 <sanitizer sanitize="False"/> <!-- NOTE(review): sanitizing is disabled and the value is interpolated into the shell command as -C "..."; treat as a shell-injection risk -->
76 </param>
77 </when>
78 </conditional>
79
80 <conditional name="similarity_filtering">
81 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/>
82 <when value="false">
83 <!-- filtering switched off: no extra inputs -->
84 </when>
85 <when value="true">
86 <!-- the selected dataset is passed to the wrapper as -F -->
87 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of their length to a sequence in the filter database will be removed. This is a suitable option if you want to remove organelle DNA or contamination"/>
88 </when>
89 </conditional>
90
91 <param name="rename" type="boolean" label="Rename sequences" checked="False" truevalue="-R" falsevalue="" help="By default, original sequence ID are used, in case your sequences do not follow proper naming scheme to label paired-end read mate, use this option. All read pairs must be complete!"/> <!-- expands to the -R flag itself, or to nothing -->
92 </inputs>
93
94
95 <outputs>
96 <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/> <!-- path handed to the wrapper via -o -->
97 <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid} and ${B.hid} "/> <!-- path handed to the wrapper via -G -->
98 </outputs>
99
100
101 <tests>
102 <test> <!-- default settings: no sampling, trimming, custom cutadapt options or similarity filter -->
103 <param name="A" value="ERR215189_1_part.fastq.gz" />
104 <param name="B" value="ERR215189_2_part.fastq.gz" />
105 <param name="max_n" value="0"/>
106 <param name="cut_off" value="10" />
107 <param name="percent_above" value="95" />
108 <output name="paired" value="paired_output.fasta" /> <!-- name must match the data output declared in outputs -->
109 <output name="png_output" value="paired_output.png" />
110 </test>
111 </tests>
112
113 <help>
114 **What it does**
115
116 This tool is designed to perform memory-efficient preprocessing of two
117 fastq files. Output of this tool can be used as input of RepeatExplorer clustering.
118 Input files can be GNU zipped archives (.gz extension).
119 Reads are filtered based on the quality, presence of N bases and
120 adapters. Two input fastq files are processed in parallel. Only complete pairs
121 are kept. As the input files are processed in chunks, it is required that
122 paired reads are complete and in the same order in both input files. All
123 reads which pass the quality filter will be written into output files.
124 If sampling is specified, only a sample of sequences will be
125 returned. Cutadapt is run with these options::
126
127 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
128 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'
129 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
130 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG'
131 --anywhere='CAAGCAGAAGACGGCATACGAGAT'
132 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC'
133 --error-rate=0.05
134 --times=1 --overlap=15 --discard
135
136
137 **Order of fastq files processing**
138
139 1. Trimming (optional)
140 #. Filter by quality
141 #. Discard single reads, keep complete pairs
142 #. Cutadapt filtering
143 #. Discard single reads, keep complete pairs
144 #. Sampling (optional)
145 #. Interlacing two fasta files
146
147 **Quality setting cut-off**
148
149 To set the quality cut-off correctly, you need to know how quality is encoded in your fastq file. The default
150 cut-off, which is suitable for Sanger and Illumina 1.8 encoding, is shown below::
151
152
153 Default filtering cut-off
154 |
155 |
156 V
157 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
158 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
159 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
160 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
161 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
162 !"#$%&amp;'()*+,-./0123456789:;&lt;=&gt;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
163 | | | | | |
164 33 59 64 73 104 126
165 0........................26...31.......40
166 -5....0........9.............................40
167 0........9.............................40
168 3.....9.............................40
169 0.2......................26...31........41
170
171 S - Sanger Phred+33, raw reads typically (0, 40)
172 X - Solexa Solexa+64, raw reads typically (-5, 40)
173 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40)
174 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40)
175 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
176 (Note: See discussion above).
177 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41)
178
179 </help>
180 </tool>
181