0
|
1 #!/usr/bin/env Rscript
|
|
2 # TODO - add triming from start and end, removing short reads
|
|
3 library(optparse, quiet = TRUE)
|
|
4 matricesSum = function(m1,m2){
|
|
5 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
|
|
6 rownames(mfin) = rownames(m2)
|
|
7 if (sum(m1)>0){
|
|
8 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
|
|
9 }
|
|
10 if (sum(m2)>0){
|
|
11 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
|
|
12 }
|
|
13
|
|
14 return(mfin)
|
|
15 }
|
|
16 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
|
|
17 # assume blastn and makeblastdb in $PATH
|
|
18 tmp_in = tempfile()
|
|
19 tmp_out = tempfile()
|
|
20 writeFasta(fx,file=tmp_in)
|
|
21 ## check if db is present:
|
|
22 dbfiles = paste0(database, ".nhr")
|
|
23 if (!file.exists(dbfiles)){
|
|
24 cat("running makeblastdb\n")
|
|
25 system(paste0("makeblastdb -dbtype nucl -in ", database))
|
|
26 }
|
|
27 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" '
|
|
28 cmd=paste("blastn", " -db", database, "-query", tmp_in,
|
|
29 "-out", tmp_out, " ", params)
|
|
30 status=system(cmd,intern=TRUE)
|
|
31 if (file.info(tmp_out)$size != 0){
|
|
32 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
|
|
33 colnames(results) = c("qseqid", "qcovs", "pident")
|
|
34 }else{
|
|
35 results=NULL
|
|
36 }
|
|
37 unlink(tmp_in)
|
|
38 unlink(tmp_out)
|
|
39 return(results)
|
|
40 }
|
|
41
|
|
42
|
|
43 plotFrequencies = function(x){
|
|
44 par(xaxs = 'i', yaxs = 'i')
|
|
45 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
|
|
46 col = c(A="red", "T"="orange", C="blue", G="green")
|
|
47 for (n in c("A","C", "T","G")){
|
|
48 points(x[n,]/colSums(x), type = 'l', col = col[n])
|
|
49 }
|
|
50 grid(ny = 50, nx = ncol(x) - 1 )
|
|
51 legend("topright", col = col, lwd = 2, legend = names(col))
|
|
52 }
|
|
53
|
|
54 option_list = list(make_option(c("-a", "--fastqA"), action = "store", type = "character",
|
|
55 help = "fastq file A", default = NA), make_option(c("-b", "--fastqB"), action = "store",
|
|
56 type = "character", help = "fastq file B", default = NA), make_option(c("-x",
|
|
57 "--fastqX"), action = "store", type = "character", help = "output fastq file X",
|
|
58 default = NA), make_option(c("-y", "--fastqY"), action = "store", type = "character",
|
|
59 help = "output fastq file Y", default = NA), make_option(c("-c", "--cut_off"),
|
|
60 action = "store", type = "numeric", help = "Quality cut-off value [default %default]",
|
|
61 default = 10), make_option(c("-r", "--rnd_seed"), action = "store", type = "numeric",
|
|
62 help = "seed for random number generator [default %default]", default = 123),
|
|
63 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
|
|
64 make_option(c("-p", "--percent_above"), action = "store", type = "numeric", help = "Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]",
|
|
65 default = 95), make_option(c("-e", "--trim_end"), action = "store", type = "numeric",
|
|
66 help = "trimming - end position [no trimming by default]", default = NA),
|
|
67 make_option(c("-s", "--trim_start"), action = "store", type = "numeric", help = "triming position - start [no trimming by default]",
|
|
68 default = NA), make_option(c("-n", "--sample_size"), action = "store", type = "numeric",
|
|
69 help = "requested sample size (number of pairs)[no sampling by default]",
|
|
70 default = NULL), make_option(c("-N", "--Max_N"), action = "store", type = "integer",
|
|
71 help = "maximum number of Ns in sequences [default %default]", default = 0),
|
|
72 make_option(c("-R", "--rename"), action = "store_true", type = "logical", help = "Rename sequences",
|
|
73 default = FALSE), make_option(c("-f", "--format"), action = "store", type = "character",
|
|
74 help = "format of output - fastq or fasta [default %default] ", default = "fasta"),
|
|
75 make_option(c("-C", "--cutadapt_options"), action = "store", type = "character",
|
|
76 help = "file specifying cutadapt options", default = NULL), make_option(c("-j",
|
|
77 "--chunk_size"), action = "store", type = "numeric", help = "Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]",
|
5
|
78 default = 1000000),
|
0
|
79 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL)
|
|
80 )
|
|
81
|
|
82 description = paste(strwrap("Script for filterinq fastq files generated from paired end sequencing
|
|
83 \t\t\t\t\t\t"),
|
|
84 collapse = "\n")
|
|
85 # arguments whic hcould be modifed
|
|
86 cutadapt_arguments = paste(" --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
|
|
87 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
|
|
88 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
|
|
89 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
|
|
90 "--error-rate=0.05 --times=1 --overlap=15 --discard ", " ", sep = "")
|
|
91
|
|
92
|
|
93 epilogue = paste(strwrap(c("Description:\n
|
|
94 This tool is designed to make memory efficient preprocessing of two fastq files. Input files can be in GNU zipped archive (.gz extension).
|
|
95 Reads are filtered based on the quality, presence of N bases and adapters.
|
|
96 Two input fastq files are procesed in parallel. Cutaddapt program must be installed to use adapter filtering.
|
|
97 Only complete pair are kept. As the input files are process in chunks,
|
|
98 it is required that pair reads are complete and in the same order in both input files. All reads which pass the quality
|
|
99 filter fill be writen into ouytput files. If --sample size is specified, only sample of sequences will be returned.
|
|
100 By default cutadapt us run with this options:
|
|
101 \n ",
|
|
102 cutadapt_arguments, "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
|
|
103 Consult cutadapt manual for usage. ")),
|
|
104 collapse = "\n")
|
|
105
|
|
106 epilogue = paste(epilogue, "
|
|
107 fastqA fastqB
|
|
108 | |
|
|
109 V V
|
|
110 ----------------------------------
|
|
111 | Trimming (optional) |
|
|
112 ----------------------------------
|
|
113 | |
|
|
114 | |
|
|
115 V V
|
|
116 ----------------------------------
|
|
117 | Filter by quality |
|
|
118 ----------------------------------
|
|
119 | |
|
|
120 | |
|
|
121 V V
|
|
122 ---------------------------------
|
|
123 | Discard single reads |--->
|
|
124 | Keep complete pairs |
|
|
125 ---------------------------------
|
|
126 | |
|
|
127 V V
|
|
128 ----------------------------------
|
|
129 | cutadapt filtering |
|
|
130 ----------------------------------
|
|
131 | |
|
|
132 V V
|
|
133 ---------------------------------
|
|
134 | Discard single reads |--->
|
|
135 | Keep complete pairs |
|
|
136 ---------------------------------
|
|
137 | |
|
|
138 | |
|
|
139 V V
|
|
140 ----------------------------------
|
|
141 | sample (optional) |
|
|
142 ----------------------------------
|
|
143 | |
|
|
144 | |
|
|
145 V V
|
|
146 FastqX FastqY
|
|
147 (or fastaX) (or fastaY)
|
|
148
|
|
149 # example:
|
|
150
|
|
151 ",
|
|
152 sep = "\n")
|
|
153
|
|
154
|
|
155 parser = OptionParser(option_list = option_list, epilogue = epilogue, description = description,
|
|
156 )
|
|
157
|
|
158
|
|
159
|
|
160 opt = parse_args(parser, args = commandArgs(TRUE))
|
|
161 if (!(opt$format %in% c("fasta", "fastq"))) {
|
|
162 stop("wrong output format")
|
|
163 }
|
|
164 if (any(is.na(c(opt$fastqA, opt$fastqB, opt$fastqX, opt$fastqY)))) {
|
|
165 cat("\nInput ond output files must be specified!\n\n")
|
|
166 print_help(parser)
|
|
167 q()
|
|
168 }
|
|
169
|
|
170
|
|
171 if (!is.null(opt$cutadapt_options)) {
|
|
172 if (file.exists(opt$cutadapt_options)) {
|
|
173 cutadapt_arguments = scan(opt$cutadapt_options, what = "character", comment.char = "#")
|
|
174
|
|
175 } else {
|
|
176 cutadapt_arguments = opt$cutadapt_options
|
|
177 }
|
|
178 ## remove 'EOF'
|
|
179 cutadapt_arguments = paste(" ", cutadapt_arguments, " ", collapse = " ", sep = " ")
|
|
180 }
|
|
181
|
|
182
|
|
183
|
|
184 cutadapt_cmd = function(input, output, cutadapt_arguments) {
|
|
185 cmds = paste("cutadapt --format=", "fastq", cutadapt_arguments, " ", c(input),
|
|
186 " -o ", c(output), sep = "")
|
|
187 }
|
|
188
|
|
189 set.seed(opt$rnd_seed)
|
|
190 PROP = opt$percent_above/100
|
|
191 MINQ = opt$cut_off
|
|
192 TRIM_END = opt$trim_end
|
|
193 TRIM_START = opt$trim_start
|
|
194
|
|
195 CHUNK_SIZE = opt$chunk_size
|
|
196
|
|
197 MIN_LENGTH = 1 + TRIM_END - ifelse(is.na(TRIM_START), 1, TRIM_START)
|
|
198 if (is.na(MIN_LENGTH)) {
|
|
199 MIN_LENGTH = 1
|
|
200 }
|
|
201
|
|
202 suppressPackageStartupMessages(library(ShortRead))
|
|
203 Qfilter = srFilter(function(x) {
|
|
204 rowSums(as(quality(x), "matrix") > MINQ, na.rm = TRUE)/nchar(sread(x)) >= PROP
|
|
205 })
|
|
206
|
|
207 Min_Length_filter = srFilter(function(x) {
|
|
208 nchar(sread(x)) >= MIN_LENGTH
|
|
209 })
|
|
210
|
|
211
|
|
212
|
|
213
|
|
214 fin_filter = compose(Qfilter, nFilter(opt$Max_N), Min_Length_filter)
|
|
215
|
|
216 if (opt$format == "fasta") {
|
|
217 writeFun = writeFasta
|
|
218 } else {
|
|
219 writeFun = writeFastq
|
|
220 }
|
|
221
|
|
222 f1out = opt$fastqX
|
|
223 f2out = opt$fastqY
|
|
224 # are output files empty?:
|
|
225
|
|
226
|
|
227 # find wich character specify pair: - it could be
|
|
228 lA = readLines(opt$fastqA, n = 1)
|
|
229 lB = readLines(opt$fastqB, n = 1)
|
|
230 pair_tag_position = which(!strsplit(lA, "")[[1]] == strsplit(lB, "")[[1]])
|
|
231
|
|
232 if ((length(pair_tag_position)) != 1 & !opt$rename) {
|
|
233 warning("unable to determine pair tag")
|
|
234 }
|
|
235
|
|
236 # pair_regex=paste(substring(lA,pair_tag_position-1,pair_tag_position-1),'.+',sep='')
|
|
237 # assume 1/ same order 2/ all id must be same
|
|
238
|
|
239 # check size of sequences - must be same...
|
|
240 n1 = countLines(opt$fastqA)/4
|
|
241
|
|
242
|
|
243 number_of_chunks = round(n1/CHUNK_SIZE)
|
5
|
244 ## adjust chunk size to make last chunk of the same size is all other
|
|
245 ## this is to avoid small last chunk
|
|
246 CHUNK_SIZE = round(n1/number_of_chunks)
|
|
247
|
0
|
248 if (number_of_chunks == 0) {
|
|
249 CHUNK_SIZE = n1
|
|
250 number_of_chunks = 1
|
|
251 }
|
|
252 if (!is.null(opt$sample_size)) {
|
5
|
253 sample_size_in_chunk = round(opt$sample_size/number_of_chunks)
|
|
254 n_missing = opt$sample_size - sample_size_in_chunk * number_of_chunks
|
0
|
255 } else {
|
5
|
256 sample_size_in_chunk = CHUNK_SIZE
|
|
257 n_missing = 0
|
0
|
258 }
|
|
259
|
5
|
260 cat("number chunks ", number_of_chunks, "\n")
|
|
261 cat("chunks size ", CHUNK_SIZE, "\n")
|
0
|
262 # adjust the chunk size to get exact count of sequences:
|
|
263 CHUNK_SIZE = ceiling(n1/number_of_chunks)
|
|
264 F_id = ifelse(opt$rename, "/1", "1")
|
|
265 R_id = ifelse(opt$rename, "/2", "2")
|
|
266
|
|
267 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
|
|
268 f2 <- FastqStreamer(opt$fastqB, CHUNK_SIZE)
|
|
269 total = 0
|
|
270 chunk = 0
|
|
271 nucleotideFrequenciesForward = nucleotideFrequenciesReverse = matrix(0)
|
|
272 while (TRUE) {
|
|
273 chunk = chunk + 1
|
5
|
274 cat("chunk number ", chunk, "\n")
|
0
|
275 fq1 <- yield(f1)
|
|
276 fq2 <- yield(f2)
|
|
277 if (length(fq1) == 0) {
|
|
278 break
|
|
279 }
|
5
|
280 cat("chunk number ", chunk, " imported\n")
|
|
281 cat("chunk size", length(fq1), "\n")
|
0
|
282 ## rename
|
|
283 chunk_id = sprintf(paste0("%0", round(log10(number_of_chunks)) + 1, "d"), chunk)
|
|
284 cat("chunk id ", chunk_id, "\n")
|
|
285 fmt = paste0("%0", round(log10(length(fq1))) + 1, "d")
|
|
286
|
|
287 ## either do not rename
|
|
288 if (opt$rename) {
|
|
289 ## or use index
|
|
290 fq1@id = fq2@id = BStringSet(paste0(chunk_id, sprintf(fmt, 1:length(fq1))))
|
|
291 } else {
|
|
292 ## or use largest common substring
|
|
293 ind = mapply(function(x, y) which(x != y)[1], strsplit(as.character(id(fq1)),
|
|
294 split = ""), strsplit(as.character(id(fq2)), split = ""))
|
|
295 fq1@id = fq2@id = subseq(id(fq1), 1, ind - 1)
|
|
296
|
|
297 }
|
|
298
|
|
299 if (!is.na(TRIM_END)) {
|
|
300 fq1 = narrow(fq1, end = ifelse((TRIM_END) > width(fq1), width(fq1), TRIM_END))
|
|
301 fq2 = narrow(fq2, end = ifelse((TRIM_END) > width(fq2), width(fq2), TRIM_END))
|
|
302 }
|
|
303 if (!is.na(TRIM_START)) {
|
|
304 fq1 = narrow(fq1, start = ifelse((TRIM_START) < width(fq1), TRIM_START, width(fq1)))
|
|
305 fq2 = narrow(fq2, start = ifelse((TRIM_START) < width(fq2), TRIM_START, width(fq2)))
|
|
306
|
|
307 }
|
|
308
|
|
309
|
|
310 fqF1 = fq1[fin_filter(fq1)]
|
|
311 fqF2 = fq2[fin_filter(fq2)]
|
|
312
|
|
313 inc1 = id(fqF1) %in% id(fqF2)
|
|
314 inc2 = id(fqF2) %in% id(fqF1)
|
|
315
|
|
316 cat("running cutadapt on chunk\n")
|
|
317 # cut addapt here: # filter parts:
|
|
318 tmp_in1 = tempfile(fileext = ".fastq")
|
|
319 tmp_out1 = tempfile(fileext = ".fastq")
|
|
320 tmp_in2 = tempfile(fileext = ".fastq")
|
|
321 tmp_out2 = tempfile(fileext = ".fastq")
|
|
322
|
|
323
|
|
324 if (is.null(formals(writeFastq)$compress)) {
|
|
325 writeFastq(fqF1[inc1], file = tmp_in1)
|
|
326 writeFastq(fqF2[inc2], file = tmp_in2)
|
|
327 } else {
|
|
328 writeFastq(fqF1[inc1], file = tmp_in1, compress = FALSE)
|
|
329 writeFastq(fqF2[inc2], file = tmp_in2, compress = FALSE)
|
|
330 }
|
|
331
|
|
332
|
|
333 cmd1 = cutadapt_cmd(tmp_in1, tmp_out1, cutadapt_arguments)
|
|
334 cmd2 = cutadapt_cmd(tmp_in2, tmp_out2, cutadapt_arguments)
|
|
335 # this should run cutadapt in parallel
|
|
336
|
|
337 status = system(paste(cmd1, "& \n", cmd2, "&\nwait"), intern = TRUE)
|
|
338 # collect results
|
|
339 ftmp1 = FastqFile(tmp_out1)
|
|
340
|
|
341 fqF1 = readFastq(ftmp1)
|
|
342 close(ftmp1)
|
|
343
|
|
344 ftmp2 = FastqFile(tmp_out2)
|
|
345 fqF2 = readFastq(ftmp2)
|
|
346 close(ftmp2)
|
|
347 ## clean up
|
|
348 unlink(tmp_out1)
|
|
349 unlink(tmp_in1)
|
|
350 unlink(tmp_out2)
|
|
351 unlink(tmp_in2)
|
|
352
|
|
353
|
|
354
|
5
|
355 ## remove sequences similar to filter database (e.g. plastid DNA)
|
0
|
356 if (!is.null(opt$filter_seq)){
|
|
357 blast_results1 = megablast(fqF1, database=opt$filter_seq)
|
|
358 blast_results2 = megablast(fqF2, database=opt$filter_seq)
|
|
359 if (!is.null(blast_results1) & !is.null(blast_results2)){
|
|
360 exclude1=with(blast_results1, unique(qseqid[pident >= 90 & qcovs >=90]))
|
|
361 exclude2=with(blast_results2, unique(qseqid[pident >= 90 & qcovs >=90]))
|
|
362 ## note - blast will truncate seq ids - anything beyond space is omitted
|
|
363 seq_to_exclude1 = gsub(" .*","",id(fqF1)) %in% exclude1
|
|
364 seq_to_exclude2 = gsub(" .*","",id(fqF2)) %in% exclude2
|
|
365 excluded_contamination1=signif(sum(seq_to_exclude1)/length(seq_to_exclude1)*100,3)
|
|
366 excluded_contamination2=signif(sum(seq_to_exclude2)/length(seq_to_exclude2)*100,3)
|
|
367 cat(excluded_contamination1,"% filtered out after blast - forward reads\n" )
|
|
368 cat(excluded_contamination2,"% filtered out after blast - reverse reads\n" )
|
|
369 fqF1 = fqF1[!seq_to_exclude1]
|
|
370 fqF2 = fqF2[!seq_to_exclude2]
|
|
371 }
|
|
372 }
|
|
373
|
|
374
|
|
375
|
|
376
|
|
377 # filter complete pairs again: id1=gsub(pair_regex,'',id(fqF1))
|
|
378 # id2=gsub(pair_regex,'',id(fqF2))
|
|
379 inc1 = id(fqF1) %in% id(fqF2)
|
|
380 inc2 = id(fqF2) %in% id(fqF1)
|
|
381 total = sum(inc1) + total
|
|
382 ## create new id - last character must differentiate pair - for interlacig
|
|
383
|
|
384 fqF1@id = BStringSet(paste0(id(fqF1), F_id))
|
|
385 fqF2@id = BStringSet(paste0(id(fqF2), R_id))
|
|
386
|
|
387
|
5
|
388 if (sum(inc1) > (sample_size_in_chunk + n_missing)) {
|
|
389 smp = sort(sample(sum(inc1), sample_size_in_chunk + n_missing))
|
|
390 n_missing = 0 ## this was to correct rounding error
|
0
|
391 writeFun(fqF1[inc1][smp], file = f1out, mode = "a")
|
|
392 writeFun(fqF2[inc2][smp], file = f2out, mode = "a")
|
|
393 nfrq1 = alphabetByCycle(sread(fqF1[inc1][smp]))
|
|
394 nfrq2 = alphabetByCycle(sread(fqF2[inc2][smp]))
|
|
395 } else {
|
|
396 writeFun(fqF1[inc1], file = f1out, mode = "a")
|
|
397 writeFun(fqF2[inc2], file = f2out, mode = "a")
|
|
398 nfrq1 = alphabetByCycle(sread(fqF1[inc1]))
|
|
399 nfrq2 = alphabetByCycle(sread(fqF2[inc2]))
|
|
400 }
|
|
401 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
|
|
402 nucleotideFrequenciesReverse = matricesSum(nucleotideFrequenciesReverse, nfrq2)
|
|
403
|
|
404 }
|
|
405 if( !is.na(opt$png_file_output)){
|
|
406 png(opt$png_file_output, width = 1000, height = 1000)
|
|
407 par(mfrow=c(2,1))
|
|
408 plotFrequencies(nucleotideFrequenciesForward)
|
|
409 mtext("forward reads")
|
|
410 plotFrequencies(nucleotideFrequenciesReverse)
|
|
411 mtext("reverse reads")
|
|
412 dev.off()
|
|
413 }
|
|
414 close(f1)
|
|
415 close(f2)
|
|
416
|