comparison paired_fastq_filtering.R @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #!/usr/bin/env Rscript
2 # TODO - add triming from start and end, removing short reads
3 library(optparse, quiet = TRUE)
4 matricesSum = function(m1,m2){
5 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
6 rownames(mfin) = rownames(m2)
7 if (sum(m1)>0){
8 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
9 }
10 if (sum(m2)>0){
11 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
12 }
13
14 return(mfin)
15 }
16 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
17 # assume blastn and makeblastdb in $PATH
18 tmp_in = tempfile()
19 tmp_out = tempfile()
20 writeFasta(fx,file=tmp_in)
21 ## check if db is present:
22 dbfiles = paste0(database, ".nhr")
23 if (!file.exists(dbfiles)){
24 cat("running makeblastdb\n")
25 system(paste0("makeblastdb -dbtype nucl -in ", database))
26 }
27 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" '
28 cmd=paste("blastn", " -db", database, "-query", tmp_in,
29 "-out", tmp_out, " ", params)
30 status=system(cmd,intern=TRUE)
31 if (file.info(tmp_out)$size != 0){
32 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
33 colnames(results) = c("qseqid", "qcovs", "pident")
34 }else{
35 results=NULL
36 }
37 unlink(tmp_in)
38 unlink(tmp_out)
39 return(results)
40 }
41
42
43 plotFrequencies = function(x){
44 par(xaxs = 'i', yaxs = 'i')
45 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
46 col = c(A="red", "T"="orange", C="blue", G="green")
47 for (n in c("A","C", "T","G")){
48 points(x[n,]/colSums(x), type = 'l', col = col[n])
49 }
50 grid(ny = 50, nx = ncol(x) - 1 )
51 legend("topright", col = col, lwd = 2, legend = names(col))
52 }
53
54 option_list = list(make_option(c("-a", "--fastqA"), action = "store", type = "character",
55 help = "fastq file A", default = NA), make_option(c("-b", "--fastqB"), action = "store",
56 type = "character", help = "fastq file B", default = NA), make_option(c("-x",
57 "--fastqX"), action = "store", type = "character", help = "output fastq file X",
58 default = NA), make_option(c("-y", "--fastqY"), action = "store", type = "character",
59 help = "output fastq file Y", default = NA), make_option(c("-c", "--cut_off"),
60 action = "store", type = "numeric", help = "Quality cut-off value [default %default]",
61 default = 10), make_option(c("-r", "--rnd_seed"), action = "store", type = "numeric",
62 help = "seed for random number generator [default %default]", default = 123),
63 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
64 make_option(c("-p", "--percent_above"), action = "store", type = "numeric", help = "Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]",
65 default = 95), make_option(c("-e", "--trim_end"), action = "store", type = "numeric",
66 help = "trimming - end position [no trimming by default]", default = NA),
67 make_option(c("-s", "--trim_start"), action = "store", type = "numeric", help = "triming position - start [no trimming by default]",
68 default = NA), make_option(c("-n", "--sample_size"), action = "store", type = "numeric",
69 help = "requested sample size (number of pairs)[no sampling by default]",
70 default = NULL), make_option(c("-N", "--Max_N"), action = "store", type = "integer",
71 help = "maximum number of Ns in sequences [default %default]", default = 0),
72 make_option(c("-R", "--rename"), action = "store_true", type = "logical", help = "Rename sequences",
73 default = FALSE), make_option(c("-f", "--format"), action = "store", type = "character",
74 help = "format of output - fastq or fasta [default %default] ", default = "fasta"),
75 make_option(c("-C", "--cutadapt_options"), action = "store", type = "character",
76 help = "file specifying cutadapt options", default = NULL), make_option(c("-j",
77 "--chunk_size"), action = "store", type = "numeric", help = "Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]",
78 default = 1e+06),
79 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL)
80 )
81
82 description = paste(strwrap("Script for filterinq fastq files generated from paired end sequencing
83 \t\t\t\t\t\t"),
84 collapse = "\n")
85 # arguments whic hcould be modifed
86 cutadapt_arguments = paste(" --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
87 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
88 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
89 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
90 "--error-rate=0.05 --times=1 --overlap=15 --discard ", " ", sep = "")
91
92
93 epilogue = paste(strwrap(c("Description:\n
94 This tool is designed to make memory efficient preprocessing of two fastq files. Input files can be in GNU zipped archive (.gz extension).
95 Reads are filtered based on the quality, presence of N bases and adapters.
96 Two input fastq files are procesed in parallel. Cutaddapt program must be installed to use adapter filtering.
97 Only complete pair are kept. As the input files are process in chunks,
98 it is required that pair reads are complete and in the same order in both input files. All reads which pass the quality
99 filter fill be writen into ouytput files. If --sample size is specified, only sample of sequences will be returned.
100 By default cutadapt us run with this options:
101 \n ",
102 cutadapt_arguments, "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
103 Consult cutadapt manual for usage. ")),
104 collapse = "\n")
105
106 epilogue = paste(epilogue, "
107 fastqA fastqB
108 | |
109 V V
110 ----------------------------------
111 | Trimming (optional) |
112 ----------------------------------
113 | |
114 | |
115 V V
116 ----------------------------------
117 | Filter by quality |
118 ----------------------------------
119 | |
120 | |
121 V V
122 ---------------------------------
123 | Discard single reads |--->
124 | Keep complete pairs |
125 ---------------------------------
126 | |
127 V V
128 ----------------------------------
129 | cutadapt filtering |
130 ----------------------------------
131 | |
132 V V
133 ---------------------------------
134 | Discard single reads |--->
135 | Keep complete pairs |
136 ---------------------------------
137 | |
138 | |
139 V V
140 ----------------------------------
141 | sample (optional) |
142 ----------------------------------
143 | |
144 | |
145 V V
146 FastqX FastqY
147 (or fastaX) (or fastaY)
148
149 # example:
150
151 ",
152 sep = "\n")
153
154
155 parser = OptionParser(option_list = option_list, epilogue = epilogue, description = description,
156 )
157
158
159
160 opt = parse_args(parser, args = commandArgs(TRUE))
161 if (!(opt$format %in% c("fasta", "fastq"))) {
162 stop("wrong output format")
163 }
164 if (any(is.na(c(opt$fastqA, opt$fastqB, opt$fastqX, opt$fastqY)))) {
165 cat("\nInput ond output files must be specified!\n\n")
166 print_help(parser)
167 q()
168 }
169
170
171 if (!is.null(opt$cutadapt_options)) {
172 if (file.exists(opt$cutadapt_options)) {
173 cutadapt_arguments = scan(opt$cutadapt_options, what = "character", comment.char = "#")
174
175 } else {
176 cutadapt_arguments = opt$cutadapt_options
177 }
178 ## remove 'EOF'
179 cutadapt_arguments = paste(" ", cutadapt_arguments, " ", collapse = " ", sep = " ")
180 }
181
182
183
184 cutadapt_cmd = function(input, output, cutadapt_arguments) {
185 cmds = paste("cutadapt --format=", "fastq", cutadapt_arguments, " ", c(input),
186 " -o ", c(output), sep = "")
187 }
188
189 set.seed(opt$rnd_seed)
190 PROP = opt$percent_above/100
191 MINQ = opt$cut_off
192 TRIM_END = opt$trim_end
193 TRIM_START = opt$trim_start
194
195 CHUNK_SIZE = opt$chunk_size
196
197 MIN_LENGTH = 1 + TRIM_END - ifelse(is.na(TRIM_START), 1, TRIM_START)
198 if (is.na(MIN_LENGTH)) {
199 MIN_LENGTH = 1
200 }
201
202 suppressPackageStartupMessages(library(ShortRead))
203 Qfilter = srFilter(function(x) {
204 rowSums(as(quality(x), "matrix") > MINQ, na.rm = TRUE)/nchar(sread(x)) >= PROP
205 })
206
207 Min_Length_filter = srFilter(function(x) {
208 nchar(sread(x)) >= MIN_LENGTH
209 })
210
211
212
213
214 fin_filter = compose(Qfilter, nFilter(opt$Max_N), Min_Length_filter)
215
216 if (opt$format == "fasta") {
217 writeFun = writeFasta
218 } else {
219 writeFun = writeFastq
220 }
221
222 f1out = opt$fastqX
223 f2out = opt$fastqY
224 # are output files empty?:
225
226
227 # find wich character specify pair: - it could be
228 lA = readLines(opt$fastqA, n = 1)
229 lB = readLines(opt$fastqB, n = 1)
230 pair_tag_position = which(!strsplit(lA, "")[[1]] == strsplit(lB, "")[[1]])
231
232 if ((length(pair_tag_position)) != 1 & !opt$rename) {
233 warning("unable to determine pair tag")
234 }
235
236 # pair_regex=paste(substring(lA,pair_tag_position-1,pair_tag_position-1),'.+',sep='')
237 # assume 1/ same order 2/ all id must be same
238
239 # check size of sequences - must be same...
240 n1 = countLines(opt$fastqA)/4
241
242
243 number_of_chunks = round(n1/CHUNK_SIZE)
244 if (number_of_chunks == 0) {
245 CHUNK_SIZE = n1
246 number_of_chunks = 1
247 }
248 if (!is.null(opt$sample_size)) {
249 sample_size_in_chunk = opt$sample_size/number_of_chunks
250 } else {
251 sample_size_in_chunk = CHUNK_SIZE
252 }
253
254 # adjust the chunk size to get exact count of sequences:
255 CHUNK_SIZE = ceiling(n1/number_of_chunks)
256 F_id = ifelse(opt$rename, "/1", "1")
257 R_id = ifelse(opt$rename, "/2", "2")
258
259 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
260 f2 <- FastqStreamer(opt$fastqB, CHUNK_SIZE)
261 total = 0
262 chunk = 0
263 nucleotideFrequenciesForward = nucleotideFrequenciesReverse = matrix(0)
264 while (TRUE) {
265 chunk = chunk + 1
266 fq1 <- yield(f1)
267 fq2 <- yield(f2)
268 if (length(fq1) == 0) {
269 break
270 }
271 ## rename
272 chunk_id = sprintf(paste0("%0", round(log10(number_of_chunks)) + 1, "d"), chunk)
273 cat("chunk id ", chunk_id, "\n")
274 fmt = paste0("%0", round(log10(length(fq1))) + 1, "d")
275
276 ## either do not rename
277 if (opt$rename) {
278 ## or use index
279 fq1@id = fq2@id = BStringSet(paste0(chunk_id, sprintf(fmt, 1:length(fq1))))
280 } else {
281 ## or use largest common substring
282 ind = mapply(function(x, y) which(x != y)[1], strsplit(as.character(id(fq1)),
283 split = ""), strsplit(as.character(id(fq2)), split = ""))
284 fq1@id = fq2@id = subseq(id(fq1), 1, ind - 1)
285
286 }
287
288 if (!is.na(TRIM_END)) {
289 fq1 = narrow(fq1, end = ifelse((TRIM_END) > width(fq1), width(fq1), TRIM_END))
290 fq2 = narrow(fq2, end = ifelse((TRIM_END) > width(fq2), width(fq2), TRIM_END))
291 }
292 if (!is.na(TRIM_START)) {
293 fq1 = narrow(fq1, start = ifelse((TRIM_START) < width(fq1), TRIM_START, width(fq1)))
294 fq2 = narrow(fq2, start = ifelse((TRIM_START) < width(fq2), TRIM_START, width(fq2)))
295
296 }
297
298
299 fqF1 = fq1[fin_filter(fq1)]
300 fqF2 = fq2[fin_filter(fq2)]
301
302 inc1 = id(fqF1) %in% id(fqF2)
303 inc2 = id(fqF2) %in% id(fqF1)
304
305 cat("running cutadapt on chunk\n")
306 # cut addapt here: # filter parts:
307 tmp_in1 = tempfile(fileext = ".fastq")
308 tmp_out1 = tempfile(fileext = ".fastq")
309 tmp_in2 = tempfile(fileext = ".fastq")
310 tmp_out2 = tempfile(fileext = ".fastq")
311
312
313 if (is.null(formals(writeFastq)$compress)) {
314 writeFastq(fqF1[inc1], file = tmp_in1)
315 writeFastq(fqF2[inc2], file = tmp_in2)
316 } else {
317 writeFastq(fqF1[inc1], file = tmp_in1, compress = FALSE)
318 writeFastq(fqF2[inc2], file = tmp_in2, compress = FALSE)
319 }
320
321
322 cmd1 = cutadapt_cmd(tmp_in1, tmp_out1, cutadapt_arguments)
323 cmd2 = cutadapt_cmd(tmp_in2, tmp_out2, cutadapt_arguments)
324 # this should run cutadapt in parallel
325
326 status = system(paste(cmd1, "& \n", cmd2, "&\nwait"), intern = TRUE)
327 # collect results
328 ftmp1 = FastqFile(tmp_out1)
329
330 fqF1 = readFastq(ftmp1)
331 close(ftmp1)
332
333 ftmp2 = FastqFile(tmp_out2)
334 fqF2 = readFastq(ftmp2)
335 close(ftmp2)
336 ## clean up
337 unlink(tmp_out1)
338 unlink(tmp_in1)
339 unlink(tmp_out2)
340 unlink(tmp_in2)
341
342
343
344 # remove sequences similar to filter database (e.g. plastid DNA)
345 if (!is.null(opt$filter_seq)){
346 blast_results1 = megablast(fqF1, database=opt$filter_seq)
347 blast_results2 = megablast(fqF2, database=opt$filter_seq)
348 if (!is.null(blast_results1) & !is.null(blast_results2)){
349 exclude1=with(blast_results1, unique(qseqid[pident >= 90 & qcovs >=90]))
350 exclude2=with(blast_results2, unique(qseqid[pident >= 90 & qcovs >=90]))
351 ## note - blast will truncate seq ids - anything beyond space is omitted
352 seq_to_exclude1 = gsub(" .*","",id(fqF1)) %in% exclude1
353 seq_to_exclude2 = gsub(" .*","",id(fqF2)) %in% exclude2
354 excluded_contamination1=signif(sum(seq_to_exclude1)/length(seq_to_exclude1)*100,3)
355 excluded_contamination2=signif(sum(seq_to_exclude2)/length(seq_to_exclude2)*100,3)
356 cat(excluded_contamination1,"% filtered out after blast - forward reads\n" )
357 cat(excluded_contamination2,"% filtered out after blast - reverse reads\n" )
358 fqF1 = fqF1[!seq_to_exclude1]
359 fqF2 = fqF2[!seq_to_exclude2]
360 }
361 }
362
363
364
365
366 # filter complete pairs again: id1=gsub(pair_regex,'',id(fqF1))
367 # id2=gsub(pair_regex,'',id(fqF2))
368 inc1 = id(fqF1) %in% id(fqF2)
369 inc2 = id(fqF2) %in% id(fqF1)
370 total = sum(inc1) + total
371 ## create new id - last character must differentiate pair - for interlacig
372
373 fqF1@id = BStringSet(paste0(id(fqF1), F_id))
374 fqF2@id = BStringSet(paste0(id(fqF2), R_id))
375
376
377 if (sum(inc1) > sample_size_in_chunk) {
378 smp = sort(sample(sum(inc1), sample_size_in_chunk))
379 writeFun(fqF1[inc1][smp], file = f1out, mode = "a")
380 writeFun(fqF2[inc2][smp], file = f2out, mode = "a")
381 nfrq1 = alphabetByCycle(sread(fqF1[inc1][smp]))
382 nfrq2 = alphabetByCycle(sread(fqF2[inc2][smp]))
383 } else {
384 writeFun(fqF1[inc1], file = f1out, mode = "a")
385 writeFun(fqF2[inc2], file = f2out, mode = "a")
386 nfrq1 = alphabetByCycle(sread(fqF1[inc1]))
387 nfrq2 = alphabetByCycle(sread(fqF2[inc2]))
388 }
389 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
390 nucleotideFrequenciesReverse = matricesSum(nucleotideFrequenciesReverse, nfrq2)
391
392 }
393 if( !is.na(opt$png_file_output)){
394 png(opt$png_file_output, width = 1000, height = 1000)
395 par(mfrow=c(2,1))
396 plotFrequencies(nucleotideFrequenciesForward)
397 mtext("forward reads")
398 plotFrequencies(nucleotideFrequenciesReverse)
399 mtext("reverse reads")
400 dev.off()
401 }
402 close(f1)
403 close(f2)
404