Mercurial > repos > petr-novak > re_utils
comparison paired_fastq_filtering.xml @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | 378565f5a875 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4cd8608ef6b |
---|---|
1 <tool id="paired_fastq_filtering" name="Preprocessing of fastq paired-reads"> | |
2 <description> | |
3 Preprocessing of paired reads fastq files | |
4 including trimming, quality filtering, cutadapt filtering and interlacing. Broken | |
5 pairs are discarded. | |
6 </description> | |
7 <requirements> | |
8 <requirement type="package">blast</requirement> | |
9 <requirement type="package">cutadapt</requirement> | |
10 <requirement type="package">bioconductor-shortread</requirement> | |
11 <requirement type="package">r-optparse</requirement> | |
12 </requirements> | |
13 <command interpreter="bash"> | |
14 paired_fastq_filtering_wrapper.sh -a '${A}' -b '${B}' -o '${paired}' -c ${cut_off} -p ${percent_above} -N ${max_n} $rename -G '${png_output}' | |
15 | |
16 #if $sampling.sequence_sampling : | |
17 -n $sampling.sample_size | |
18 #end if | |
19 | |
20 #if $trimming.sequence_trimming : | |
21 -e $trimming.trim_end -s $trimming.trim_start | |
22 #end if | |
23 ## custom_options is free text with sanitizing turned off, so it reaches the shell verbatim inside the double quotes | |
24 #if $cutadapt.use_custom : | |
25 -C "${cutadapt.custom_options}" | |
26 #end if | |
27 | |
28 #if $similarity_filtering.include : | |
29 -F '${similarity_filtering.filter_database}' | |
30 #end if | |
31 | |
32 </command> | |
33 | |
34 <inputs> | |
35 <param format="fastq,fastq.gz" type="data" name="A" label="Left-hand reads" /> | |
36 | |
37 <param format="fastq,fastq.gz" type="data" name="B" label="Right-hand reads" /> | |
38 | |
39 <conditional name="sampling"> | |
40 <param name="sequence_sampling" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Sequence sampling"/> | |
41 <when value="false"> | |
42 <!-- sampling disabled: no -n option is added to the command line --> | |
43 </when> | |
44 <when value="true"> | |
45 <param name="sample_size" type="integer" label="Sample size(number of pairs)" help="How many sequence pairs should be in resulting dataset" value="500000" min="0"/> | |
46 </when> | |
47 </conditional> | |
48 | |
49 <param type="integer" name="cut_off" label="Quality cut-off" value="10" min="0" help="see below how to correctly set quality cut-off" /> | |
50 <param type="integer" name="percent_above" label="percent above cutoff" value="95" min="0" max="100" | |
51 help="Percent of bases in sequence that must have quality equal to / higher than cut-off value" /> | |
52 | |
53 <conditional name="trimming"> | |
54 <param name="sequence_trimming" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Trim sequences"/> | |
55 <when value="false"> | |
56 <!-- trimming disabled: no -e / -s options are added to the command line --> | |
57 </when> | |
58 <when value="true"> | |
59 <param type="integer" name="trim_start" label="trimming - start position" value="1" min="1" | |
60 help="sequences are trimmed at specified start" /> | |
61 <param type="integer" name="trim_end" label="trimming - end position" value="100" min="1" | |
62 help="sequences are trimmed to specified end position, shorter sequences are discarded" /> | |
63 </when> | |
64 | |
65 </conditional> | |
66 <param name="max_n" type="integer" label="maximum Ns" help="Maximum number of Ns in sequence" value="0" min="0" max="10"/> | |
67 | |
68 <conditional name="cutadapt"> | |
69 <param name="use_custom" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do you want to use custom cutadapt options"/> | |
70 <when value="false"> | |
71 <!-- custom options disabled: no -C option is added to the command line --> | |
72 </when> | |
73 <when value="true"> | |
74 <param name="custom_options" type="text" area="True" size="8x30" label="Cutadapt custom options" help="Consult cutadapt for usage" value=""> | |
75 <sanitizer sanitize="False"/> | |
76 </param> | |
77 </when> | |
78 </conditional> | |
79 | |
80 <conditional name="similarity_filtering"> | |
81 <param name="include" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use similarity search filtering"/> | |
82 <when value="false"> | |
83 <!-- similarity filtering disabled: no -F option is added to the command line --> | |
84 </when> | |
85 <when value="true"> | |
86 | |
87 <param name="filter_database" format="fasta" type="data" label="Sequence filter database" help="Provide DNA sequences in fasta format. Sequence reads which have at least 90% similarity over 90% of their length to a sequence in the filter database will be removed. This is a suitable option if you want to remove organelle DNA or contamination"/> | |
88 </when> | |
89 </conditional> | |
90 | |
91 <param name="rename" type="boolean" truevalue="-R" falsevalue="" checked="False" label="Rename sequences" help="By default, original sequence ID are used, in case your sequences do not follow proper naming scheme to label paired-end read mate, use this option. All read pairs must be complete!"/> | |
92 </inputs> | |
93 | |
94 | |
95 <outputs> <!-- paired is passed to the wrapper with -o, png_output with -G --> | |
96 <data format="fasta" name="paired" label="Interlaced paired reads from datasets ${A.hid} and ${B.hid} "/> | |
97 <data format="png" name="png_output" label="nucleotide composition after filtering of ${A.hid} and ${B.hid} "/> | |
98 </outputs> | |
99 | |
100 | |
101 <tests> | |
102 <test> <!-- each output name below must match a data element declared in outputs --> | |
103 <param name="A" value="ERR215189_1_part.fastq.gz" /> | |
104 <param name="B" value="ERR215189_2_part.fastq.gz" /> | |
105 <param name="max_n" value="0"/> | |
106 <param name="cut_off" value="10" /> | |
107 <param name="percent_above" value="95" /> | |
108 <output name="paired" value="paired_output.fasta" /> | |
109 <output name="png_output" value="paired_output.png" /> | |
110 </test> | |
111 </tests> | |
112 | |
113 <help> | |
114 **What it does** | |
115 | |
116 This tool performs memory-efficient preprocessing of two | |
117 fastq files. Output of this tool can be used as input for RepeatExplorer clustering. | |
118 Input files can be gzip-compressed (.gz extension). | |
119 Reads are filtered based on quality, presence of N bases and | |
120 adapters. The two input fastq files are processed in parallel. Only complete pairs | |
121 are kept. As the input files are processed in chunks, it is required that | |
122 paired reads are complete and in the same order in both input files. All | |
123 reads which pass the quality filter will be written to the output. | |
124 If sampling is specified, only a sample of the sequences will be | |
125 returned. Cutadapt is run with these options:: | |
126 | |
127 --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' | |
128 --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' | |
129 --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' | |
130 --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' | |
131 --anywhere='CAAGCAGAAGACGGCATACGAGAT' | |
132 --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' | |
133 --error-rate=0.05 | |
134 --times=1 --overlap=15 --discard | |
135 | |
136 | |
137 **Order of fastq files processing** | |
138 | |
139 1. Trimming (optional) | |
140 #. Filter by quality | |
141 #. Discard single reads, keep complete pairs | |
142 #. Cutadapt filtering | |
143 #. Discard single reads, keep complete pairs | |
144 #. Sampling (optional) | |
145 #. Interlacing two fasta files | |
146 | |
147 **Quality setting cut-off** | |
148 | |
149 To correctly set the quality cut-off, you need to know how quality is encoded in your fastq file. The default | |
150 filtering cut-off, which is suitable for Sanger and Illumina 1.8 encoding, is shown below:: | |
151 | |
152 | |
153 Default filtering cut-off | |
154 | | |
155 | | |
156 V | |
157 SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS..................................................... | |
158 ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX...................... | |
159 ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...................... | |
160 .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ...................... | |
161 LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL.................................................... | |
162 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ | |
163 | | | | | | | |
164 33 59 64 73 104 126 | |
165 0........................26...31.......40 | |
166 -5....0........9.............................40 | |
167 0........9.............................40 | |
168 3.....9.............................40 | |
169 0.2......................26...31........41 | |
170 | |
171 S - Sanger Phred+33, raw reads typically (0, 40) | |
172 X - Solexa Solexa+64, raw reads typically (-5, 40) | |
173 I - Illumina 1.3+ Phred+64, raw reads typically (0, 40) | |
174 J - Illumina 1.5+ Phred+64, raw reads typically (3, 40) | |
175 with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold) | |
176 (Note: See discussion above). | |
177 L - Illumina 1.8+ Phred+33, raw reads typically (0, 41) | |
178 | |
179 </help> | |
180 </tool> | |
181 |