Mercurial > repos > petr-novak > re_utils
annotate paired_fastq_filtering.R @ 34:91996b991991 draft default tip
Uploaded
author | petr-novak |
---|---|
date | Fri, 16 Feb 2024 15:22:21 +0000 |
parents | f1738f8649b0 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env Rscript |
2 # TODO - add triming from start and end, removing short reads | |
3 library(optparse, quiet = TRUE) | |
# Sum two nucleotide-frequency matrices that may differ in column count
# (i.e. in read length). The result takes its row names from m2 and is as
# wide as the wider input; missing columns are treated as zero counts.
# An all-zero matrix (the initial accumulator, matrix(0)) contributes
# nothing and its dimensions are ignored.
matricesSum <- function(m1, m2) {
  mfin <- matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
  rownames(mfin) <- rownames(m2)
  if (sum(m1) > 0) {
    cols <- seq_len(ncol(m1))
    mfin[, cols] <- mfin[, cols] + m1
  }
  if (sum(m2) > 0) {
    cols <- seq_len(ncol(m2))
    mfin[, cols] <- mfin[, cols] + m2
  }
  mfin
}
# Run a blastn similarity search of sequences `fx` against `database`.
#
# Args:
#   fx:       sequences (ShortRead object accepted by writeFasta).
#   params:   blastn command-line options; the default reports the single
#             best alignment as "qseqid qcovs pident" (tabular outfmt 6).
#             NOTE: in the previous revision this argument was silently
#             overwritten inside the function; it is now honored.
#   database: path to a FASTA file; a BLAST nucleotide database is built
#             next to it on first use (presence of the .nhr index file is
#             used as the marker).
#
# Returns: a data.frame with columns qseqid/qcovs/pident, or NULL when
#          blastn produced no hits.
#
# Assumes blastn and makeblastdb are available in $PATH.
megablast <- function(fx,
                      params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" ',
                      database) {
  tmp_in <- tempfile()
  tmp_out <- tempfile()
  # remove temporary files even if blastn or parsing fails
  on.exit(unlink(c(tmp_in, tmp_out)), add = TRUE)
  writeFasta(fx, file = tmp_in)
  ## check if db is present:
  dbfiles <- paste0(database, ".nhr")
  if (!file.exists(dbfiles)) {
    cat("running makeblastdb\n")
    system(paste0("makeblastdb -dbtype nucl -in ", database))
  }
  cmd <- paste("blastn", " -db", database, "-query", tmp_in,
               "-out", tmp_out, " ", params)
  system(cmd, intern = TRUE)
  if (file.info(tmp_out)$size != 0) {
    results <- read.table(tmp_out, sep = "\t", as.is = TRUE, comment.char = "")
    colnames(results) <- c("qseqid", "qcovs", "pident")
  } else {
    # empty output file means no alignments were found
    results <- NULL
  }
  results
}
41 | |
42 | |
# Plot per-position nucleotide proportions from an alphabet-by-cycle
# count matrix `x` (rows = letters, columns = read positions). One line
# per base (A/C/T/G), proportions on the y axis.
plotFrequencies <- function(x) {
  par(xaxs = 'i', yaxs = 'i')
  plot(NULL, xlim = c(1, ncol(x)), ylim = c(0, 1), type = 'n',
       xlab = "position", ylab = "proportion")
  col <- c(A = "red", "T" = "orange", C = "blue", G = "green")
  totals <- colSums(x)
  for (base in c("A", "C", "T", "G")) {
    points(x[base, ] / totals, type = 'l', col = col[base])
  }
  grid(ny = 50, nx = ncol(x) - 1)
  legend("topright", col = col, lwd = 2, legend = names(col))
}
53 | |
# Command-line interface (optparse): one make_option per flag, in the
# same order and with the same defaults and help strings as before.
option_list <- list(
  make_option(c("-a", "--fastqA"), action = "store", type = "character",
              help = "fastq file A", default = NA),
  make_option(c("-b", "--fastqB"), action = "store", type = "character",
              help = "fastq file B", default = NA),
  make_option(c("-x", "--fastqX"), action = "store", type = "character",
              help = "output fastq file X", default = NA),
  make_option(c("-y", "--fastqY"), action = "store", type = "character",
              help = "output fastq file Y", default = NA),
  make_option(c("-c", "--cut_off"), action = "store", type = "numeric",
              help = "Quality cut-off value [default %default]", default = 10),
  make_option(c("-r", "--rnd_seed"), action = "store", type = "numeric",
              help = "seed for random number generator [default %default]",
              default = 123),
  make_option(c("-G", "--png_file_output"), action = "store",
              type = "character", default = NA),
  make_option(c("-p", "--percent_above"), action = "store", type = "numeric",
              help = "Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]",
              default = 95),
  make_option(c("-e", "--trim_end"), action = "store", type = "numeric",
              help = "trimming - end position [no trimming by default]",
              default = NA),
  make_option(c("-s", "--trim_start"), action = "store", type = "numeric",
              help = "triming position - start [no trimming by default]",
              default = NA),
  make_option(c("-n", "--sample_size"), action = "store", type = "numeric",
              help = "requested sample size (number of pairs)[no sampling by default]",
              default = NULL),
  make_option(c("-N", "--Max_N"), action = "store", type = "integer",
              help = "maximum number of Ns in sequences [default %default]",
              default = 0),
  make_option(c("-R", "--rename"), action = "store_true", type = "logical",
              help = "Rename sequences", default = FALSE),
  make_option(c("-f", "--format"), action = "store", type = "character",
              help = "format of output - fastq or fasta [default %default] ",
              default = "fasta"),
  make_option(c("-C", "--cutadapt_options"), action = "store",
              type = "character",
              help = "file specifying cutadapt options", default = NULL),
  make_option(c("-j", "--chunk_size"), action = "store", type = "numeric",
              help = "Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]",
              default = 1000000),
  make_option(c("-F", "--filter_seq"), action = "store", type = "character",
              help = "file specifying sequences for filtering (e.g. plastid DNA)",
              default = NULL)
)
81 | |
# Short description shown in --help output (typo "filterinq" fixed).
description <- paste(strwrap("Script for filtering fastq files generated from paired end sequencing
\t\t\t\t\t\t"),
                     collapse = "\n")
# Default cutadapt arguments (Illumina adapter sequences); these can be
# overridden with -C/--cutadapt_options.
cutadapt_arguments <- paste(" --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
                            " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
                            " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
                            " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
                            "--error-rate=0.05 --times=1 --overlap=15 --discard ", " ", sep = "")
91 | |
92 | |
93 epilogue = paste(strwrap(c("Description:\n | |
94 This tool is designed to make memory efficient preprocessing of two fastq files. Input files can be in GNU zipped archive (.gz extension). | |
95 Reads are filtered based on the quality, presence of N bases and adapters. | |
96 Two input fastq files are procesed in parallel. Cutaddapt program must be installed to use adapter filtering. | |
97 Only complete pair are kept. As the input files are process in chunks, | |
98 it is required that pair reads are complete and in the same order in both input files. All reads which pass the quality | |
99 filter fill be writen into ouytput files. If --sample size is specified, only sample of sequences will be returned. | |
100 By default cutadapt us run with this options: | |
101 \n ", | |
102 cutadapt_arguments, "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options. | |
103 Consult cutadapt manual for usage. ")), | |
104 collapse = "\n") | |
105 | |
106 epilogue = paste(epilogue, " | |
107 fastqA fastqB | |
108 | | | |
109 V V | |
110 ---------------------------------- | |
111 | Trimming (optional) | | |
112 ---------------------------------- | |
113 | | | |
114 | | | |
115 V V | |
116 ---------------------------------- | |
117 | Filter by quality | | |
118 ---------------------------------- | |
119 | | | |
120 | | | |
121 V V | |
122 --------------------------------- | |
123 | Discard single reads |---> | |
124 | Keep complete pairs | | |
125 --------------------------------- | |
126 | | | |
127 V V | |
128 ---------------------------------- | |
129 | cutadapt filtering | | |
130 ---------------------------------- | |
131 | | | |
132 V V | |
133 --------------------------------- | |
134 | Discard single reads |---> | |
135 | Keep complete pairs | | |
136 --------------------------------- | |
137 | | | |
138 | | | |
139 V V | |
140 ---------------------------------- | |
141 | sample (optional) | | |
142 ---------------------------------- | |
143 | | | |
144 | | | |
145 V V | |
146 FastqX FastqY | |
147 (or fastaX) (or fastaY) | |
148 | |
149 # example: | |
150 | |
151 ", | |
152 sep = "\n") | |
153 | |
154 | |
# Build the option parser (trailing empty argument in the original call
# removed) and parse the command line.
parser <- OptionParser(option_list = option_list, epilogue = epilogue,
                       description = description)

opt <- parse_args(parser, args = commandArgs(TRUE))

# Validate arguments early, before any heavy processing starts.
if (!(opt$format %in% c("fasta", "fastq"))) {
  stop("wrong output format")
}
if (any(is.na(c(opt$fastqA, opt$fastqB, opt$fastqX, opt$fastqY)))) {
  # typo fixed: "ond" -> "and"
  cat("\nInput and output files must be specified!\n\n")
  print_help(parser)
  q()
}
169 | |
170 | |
# Optional user-supplied cutadapt arguments: the -C value is either the
# path to a file containing the options ('#' comments allowed) or the
# options given directly on the command line.
if (!is.null(opt$cutadapt_options)) {
  if (file.exists(opt$cutadapt_options)) {
    cutadapt_arguments <- scan(opt$cutadapt_options, what = "character",
                               comment.char = "#")
  } else {
    cutadapt_arguments <- opt$cutadapt_options
  }
  ## flatten to a single space-separated string
  cutadapt_arguments <- paste(" ", cutadapt_arguments, " ",
                              collapse = " ", sep = " ")
}
181 | |
182 | |
183 | |
# Build the shell command line(s) that run cutadapt with the given
# arguments on `input`, writing filtered reads to `output`.
# Vectorized: parallel vectors of inputs/outputs yield one command each.
cutadapt_cmd <- function(input, output, cutadapt_arguments) {
  cmds <- paste0("cutadapt ", cutadapt_arguments, " ", c(input),
                 " -o ", c(output))
}
188 | |
# Derived settings from the parsed options.
set.seed(opt$rnd_seed)
PROP <- opt$percent_above / 100   # required fraction of high-quality bases
MINQ <- opt$cut_off               # per-base quality threshold
TRIM_END <- opt$trim_end
TRIM_START <- opt$trim_start

CHUNK_SIZE <- opt$chunk_size

# Minimal read length after trimming; falls back to 1 when no trimming
# window was requested (TRIM_END is NA).
MIN_LENGTH <- 1 + TRIM_END - ifelse(is.na(TRIM_START), 1, TRIM_START)
if (is.na(MIN_LENGTH)) {
  MIN_LENGTH <- 1
}

suppressPackageStartupMessages(library(ShortRead))

# Keep reads where at least PROP of the bases reach quality MINQ.
Qfilter <- srFilter(function(x) {
  rowSums(as(quality(x), "matrix") > MINQ, na.rm = TRUE) / nchar(sread(x)) >= PROP
})

# Keep reads that are long enough after trimming.
Min_Length_filter <- srFilter(function(x) {
  nchar(sread(x)) >= MIN_LENGTH
})

# Combined filter: quality, maximum number of Ns, minimal length.
fin_filter <- compose(Qfilter, nFilter(opt$Max_N), Min_Length_filter)

# Output writer matching the requested format.
if (opt$format == "fasta") {
  writeFun <- writeFasta
} else {
  writeFun <- writeFastq
}
221 | |
f1out <- opt$fastqX
f2out <- opt$fastqY

# Determine which character position distinguishes the two mates by
# comparing the first header line of each input file.
lA <- readLines(opt$fastqA, n = 1)
lB <- readLines(opt$fastqB, n = 1)
pair_tag_position <- which(!strsplit(lA, "")[[1]] == strsplit(lB, "")[[1]])

if ((length(pair_tag_position)) != 1 & !opt$rename) {
  warning("unable to determine pair tag")
}

# Assumes: 1/ same read order in both files, 2/ matching ids.
# Count reads in file A (4 lines per fastq record).
n1 <- countLines(opt$fastqA) / 4

## Adjust the chunk size so the last chunk is roughly the same size as
## all the others (avoids a tiny trailing chunk).
number_of_chunks <- round(n1 / CHUNK_SIZE)
CHUNK_SIZE <- round(n1 / number_of_chunks)

if (number_of_chunks == 0) {
  CHUNK_SIZE <- n1
  number_of_chunks <- 1
}
# Spread the requested sample evenly over all chunks; n_missing carries
# the rounding remainder and is consumed by the first big-enough chunk.
if (!is.null(opt$sample_size)) {
  sample_size_in_chunk <- round(opt$sample_size / number_of_chunks)
  n_missing <- opt$sample_size - sample_size_in_chunk * number_of_chunks
} else {
  sample_size_in_chunk <- CHUNK_SIZE
  n_missing <- 0
}

cat("number chunks ", number_of_chunks, "\n")
cat("chunks size ", CHUNK_SIZE, "\n")
# adjust the chunk size to get exact count of sequences:
CHUNK_SIZE <- ceiling(n1 / number_of_chunks)
# Suffix appended to read ids to mark the forward/reverse mate.
F_id <- ifelse(opt$rename, "/1", "1")
R_id <- ifelse(opt$rename, "/2", "2")
266 | |
# Main pipeline: stream both fastq files chunk by chunk, filter by
# quality/Ns/length, run cutadapt, optionally remove contamination by
# BLAST, keep only complete pairs and write (optionally sampled) output
# while accumulating per-cycle nucleotide frequencies.
f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
f2 <- FastqStreamer(opt$fastqB, CHUNK_SIZE)
total <- 0
chunk <- 0
nucleotideFrequenciesForward <- nucleotideFrequenciesReverse <- matrix(0)
while (TRUE) {
  chunk <- chunk + 1
  cat("chunk number ", chunk, "\n")
  fq1 <- yield(f1)
  fq2 <- yield(f2)
  if (length(fq1) == 0) {
    break
  }
  cat("chunk number ", chunk, " imported\n")
  cat("chunk size", length(fq1), "\n")
  ## rename
  chunk_id <- sprintf(paste0("%0", round(log10(number_of_chunks)) + 1, "d"), chunk)
  cat("chunk id ", chunk_id, "\n")
  fmt <- paste0("%0", round(log10(length(fq1))) + 1, "d")

  if (opt$rename) {
    ## new ids: zero-padded chunk index + sequence index within chunk
    fq1@id <- fq2@id <- BStringSet(paste0(chunk_id, sprintf(fmt, 1:length(fq1))))
  } else {
    ## keep the longest common id prefix of each mate pair
    ind <- mapply(function(x, y) which(x != y)[1],
                  strsplit(as.character(id(fq1)), split = ""),
                  strsplit(as.character(id(fq2)), split = ""))
    fq1@id <- fq2@id <- subseq(id(fq1), 1, ind - 1)
  }

  ## optional trimming, bounded by the actual read width
  if (!is.na(TRIM_END)) {
    fq1 <- narrow(fq1, end = ifelse((TRIM_END) > width(fq1), width(fq1), TRIM_END))
    fq2 <- narrow(fq2, end = ifelse((TRIM_END) > width(fq2), width(fq2), TRIM_END))
  }
  if (!is.na(TRIM_START)) {
    fq1 <- narrow(fq1, start = ifelse((TRIM_START) < width(fq1), TRIM_START, width(fq1)))
    fq2 <- narrow(fq2, start = ifelse((TRIM_START) < width(fq2), TRIM_START, width(fq2)))
  }

  ## quality / N-count / length filtering
  fqF1 <- fq1[fin_filter(fq1)]
  fqF2 <- fq2[fin_filter(fq2)]

  ## keep only reads whose mate also survived the filters
  inc1 <- id(fqF1) %in% id(fqF2)
  inc2 <- id(fqF2) %in% id(fqF1)

  cat("running cutadapt on chunk\n")
  # adapter filtering via cutadapt on temporary uncompressed fastq files
  tmp_in1 <- tempfile(fileext = ".fastq")
  tmp_out1 <- tempfile(fileext = ".fastq")
  tmp_in2 <- tempfile(fileext = ".fastq")
  tmp_out2 <- tempfile(fileext = ".fastq")

  ## older ShortRead versions have no `compress` argument on writeFastq
  if (is.null(formals(writeFastq)$compress)) {
    writeFastq(fqF1[inc1], file = tmp_in1)
    writeFastq(fqF2[inc2], file = tmp_in2)
  } else {
    writeFastq(fqF1[inc1], file = tmp_in1, compress = FALSE)
    writeFastq(fqF2[inc2], file = tmp_in2, compress = FALSE)
  }

  cmd1 <- cutadapt_cmd(tmp_in1, tmp_out1, cutadapt_arguments)
  cmd2 <- cutadapt_cmd(tmp_in2, tmp_out2, cutadapt_arguments)
  ## run both cutadapt processes in parallel and wait for them to finish
  status <- system(paste(cmd1, "& \n", cmd2, "&\nwait"), intern = TRUE)

  ## collect cutadapt results
  ftmp1 <- FastqFile(tmp_out1)
  fqF1 <- readFastq(ftmp1)
  close(ftmp1)

  ftmp2 <- FastqFile(tmp_out2)
  fqF2 <- readFastq(ftmp2)
  close(ftmp2)
  ## clean up
  unlink(tmp_out1)
  unlink(tmp_in1)
  unlink(tmp_out2)
  unlink(tmp_in2)

  ## remove sequences similar to filter database (e.g. plastid DNA)
  if (!is.null(opt$filter_seq)) {
    blast_results1 <- megablast(fqF1, database = opt$filter_seq)
    blast_results2 <- megablast(fqF2, database = opt$filter_seq)
    if (!is.null(blast_results1) & !is.null(blast_results2)) {
      exclude1 <- with(blast_results1, unique(qseqid[pident >= 90 & qcovs >= 90]))
      exclude2 <- with(blast_results2, unique(qseqid[pident >= 90 & qcovs >= 90]))
      ## note - blast truncates seq ids - anything beyond a space is omitted
      seq_to_exclude1 <- gsub(" .*", "", id(fqF1)) %in% exclude1
      seq_to_exclude2 <- gsub(" .*", "", id(fqF2)) %in% exclude2
      excluded_contamination1 <- signif(sum(seq_to_exclude1) / length(seq_to_exclude1) * 100, 3)
      excluded_contamination2 <- signif(sum(seq_to_exclude2) / length(seq_to_exclude2) * 100, 3)
      cat(excluded_contamination1, "% filtered out after blast - forward reads\n")
      cat(excluded_contamination2, "% filtered out after blast - reverse reads\n")
      fqF1 <- fqF1[!seq_to_exclude1]
      fqF2 <- fqF2[!seq_to_exclude2]
    }
  }

  ## re-select complete pairs (cutadapt/blast may have dropped mates)
  inc1 <- id(fqF1) %in% id(fqF2)
  inc2 <- id(fqF2) %in% id(fqF1)
  total <- sum(inc1) + total
  ## append mate tag so ids stay unique when interlacing output
  fqF1@id <- BStringSet(paste0(id(fqF1), F_id))
  fqF2@id <- BStringSet(paste0(id(fqF2), R_id))

  if (sum(inc1) > (sample_size_in_chunk + n_missing)) {
    ## subsample the chunk; n_missing (rounding remainder) is used once
    smp <- sort(sample(sum(inc1), sample_size_in_chunk + n_missing))
    n_missing <- 0
    writeFun(fqF1[inc1][smp], file = f1out, mode = "a")
    writeFun(fqF2[inc2][smp], file = f2out, mode = "a")
    nfrq1 <- alphabetByCycle(sread(fqF1[inc1][smp]))
    nfrq2 <- alphabetByCycle(sread(fqF2[inc2][smp]))
  } else {
    writeFun(fqF1[inc1], file = f1out, mode = "a")
    writeFun(fqF2[inc2], file = f2out, mode = "a")
    nfrq1 <- alphabetByCycle(sread(fqF1[inc1]))
    nfrq2 <- alphabetByCycle(sread(fqF2[inc2]))
  }
  nucleotideFrequenciesForward <- matricesSum(nucleotideFrequenciesForward, nfrq1)
  nucleotideFrequenciesReverse <- matricesSum(nucleotideFrequenciesReverse, nfrq2)
}
# Optional PNG report: per-cycle nucleotide proportions for both mates.
if (!is.na(opt$png_file_output)) {
  png(opt$png_file_output, width = 1000, height = 1000)
  par(mfrow = c(2, 1))
  plotFrequencies(nucleotideFrequenciesForward)
  mtext("forward reads")
  plotFrequencies(nucleotideFrequenciesReverse)
  mtext("reverse reads")
  dev.off()
}
# release the fastq streamer connections
close(f1)
close(f2)
416 |