comparison single_fastq_filtering.R @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
comparison
equal deleted inserted replaced
-1:000000000000 0:a4cd8608ef6b
1 #!/usr/bin/env Rscript
2
3
4
5 library(optparse,quiet=TRUE)
6 matricesSum = function(m1,m2){
7 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
8 rownames(mfin) = rownames(m2)
9 if (sum(m1)>0){
10 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
11 }
12 if (sum(m2)>0){
13 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
14 }
15
16 return(mfin)
17 }
18
19 plotFrequencies = function(x){
20 par(xaxs = 'i', yaxs = 'i')
21 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
22 col = c(A="red", "T"="orange", C="blue", G="green")
23 for (n in c("A","C", "T","G")){
24 points(x[n,]/colSums(x), type = 'l', col = col[n])
25 }
26 grid(ny = 50, nx = ncol(x) - 1 )
27 legend("topright", col = col, lwd = 2, legend = names(col))
28 }
29
30 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
31 # assume blastn and makeblastdb in $PATH
32 tmp_in = tempfile()
33 tmp_out = tempfile()
34 writeFasta(fx,file=tmp_in)
35 ## check if db is present:
36 dbfiles = paste0(database, ".nhr")
37 if (!file.exists(dbfiles)){
38 cat("running makeblastdb\n")
39 system(paste0("makeblastdb -dbtype nucl -in ", database))
40 }
41 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" '
42 cmd=paste("blastn", " -db", database, "-query", tmp_in,
43 "-out", tmp_out, " ", params)
44 status=system(cmd,intern=TRUE)
45
46 if (file.info(tmp_out)$size != 0){
47 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
48 colnames(results) = c("qseqid", "qcovs", "pident")
49 }else{
50 results=NULL
51 }
52 unlink(tmp_in)
53 unlink(tmp_out)
54 return(results)
55 }
56
57
58
59 option_list = list(
60 make_option(c('-a', '--fastqA'),action='store',type='character',help='fastq file A',default=NA),
61 make_option(c('-x', '--fastqX'),action='store',type='character',help='output fastq file X',default=NA),
62 make_option(c('-c', '--cut_off'),action='store',type='numeric',help='Quality cut-off value [default %default]',default=10),
63 make_option(c('-r', '--rnd_seed'),action='store',type='numeric',help='seed for random number generator [default %default]',default=123),
64 make_option(c('-p', '--percent_above'),action='store',type='numeric',
65 help='Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]',default=95),
66 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
67 make_option(c('-e', '--trim_end'), action='store',type='numeric',help="trimming - end position [no trimming by default]", default=NA),
68 make_option(c('-s', '--trim_start'), action='store',type='numeric',help="triming position - start [no trimming by default]", default=NA),
69
70 make_option(c('-n', '--sample_size'),action='store',type='numeric',help='requested sample size (number of pairs)[no sampling by default]',default=NULL),
71 make_option(c('-N', '--Max_N'),action='store',type='integer',help='maximum number of Ns in sequences [default %default]',default=0),
72 make_option(c('-f', '--format'),action='store',type='character',help='format of output - fastq or fasta [default %default] ',default="fasta"),
73 make_option(c('-C', '--cutadapt_options'),action='store',type='character',help='file specifying cutadapt options',default=NULL),
74 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL),
75 make_option(c('-j', '--chunk_size'),action='store',type='numeric',help='Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]',default=1000000)
76
77
78
79
80
81 )
82
83 description=paste(strwrap('Script for filterinq fastq file f
84 '),collapse="\n")
85 # arguments whic hcould be modifed
86 cutadapt_arguments=paste(
87 " --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
88 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
89 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
90 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
91 "--error-rate=0.05 --times=1 --overlap=15 --discard ",
92 " ",sep='')
93
94
95 epilogue=paste(strwrap(c('Description:\n
96 This tool is designed to make memory efficient preprocessing of single fastq files. Input files can be in GNU zipped archive (.gz extension).
97 Reads are filtered based on the quality, presence of N bases and adapters.
98 Cutaddapt program must be installed to use adapter filtering.
99 The input files are process in chunks,
100 All reads which pass the quality filter fill be writen into output file.
101 If --sample size is specified, only sample of sequences will be returned.
102 By default cutadapt us run with this options:
103 \n ',
104 cutadapt_arguments,
105 "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
106 Consult cutadapt manual for usage. ")
107 ),collapse="\n")
108
109 epilogue=paste(epilogue,"
110
111
112 # example:
113 TODO
114
115 ",sep='\n')
116
117
118 parser=OptionParser(
119 option_list=option_list,
120 epilogue=epilogue,
121 description=description,
122 )
123
124
125
126 opt = parse_args(parser, args=commandArgs(TRUE))
127 if (!(opt$format %in% c("fasta","fastq"))){
128 stop("wrong output format")
129 }
130
131 if (any(is.na(c(opt$fastqA, opt$fastqX )))){
132 cat("\nInput ond output files must be specified!\n\n")
133 print_help(parser)
134 q()
135 }
136
137
138 if (!is.null(opt$cutadapt_options)){
139 cutadapt_argumenns=scan(opt$cutadapt_options,what="character", comment.char="#")
140 #remove "EOF"
141 cutadapt_arguments=paste(" ",cutadapt_arguments," ",collapse=' ',sep=" ")
142 }
143
144
145 cutadapt_cmd=function(input,output,cutadapt_arguments){
146 cmds=paste("cutadapt --format=",
147 "fastq",
148 cutadapt_arguments,
149 " ",
150 c(input),
151 " -o ",
152 c(output),
153 sep="")
154
155 }
156
157 set.seed(opt$rnd_seed)
158
159 PROP=opt$percent_above/100
160 MINQ=opt$cut_off
161 TRIM_END=opt$trim_end
162 TRIM_START=opt$trim_start
163
164 CHUNK_SIZE=opt$chunk_size
165
166 MIN_LENGTH=TRIM_END - ifelse(is.na(TRIM_START),1,TRIM_START)+1
167 if (is.na(MIN_LENGTH)){
168 MIN_LENGTH=1
169 }
170
171 suppressPackageStartupMessages(library(ShortRead))
172 Qfilter=srFilter(function(x){
173 rowSums(as(quality(x),'matrix')>MINQ,na.rm=TRUE)/nchar(sread(x))>=PROP
174 }
175 )
176
177 Min_Length_filter=srFilter(function(x){
178 nchar(sread(x))>=MIN_LENGTH
179 }
180 )
181
182
183 fin_filter=compose(Qfilter,nFilter(opt$Max_N),Min_Length_filter)
184
185 if (opt$format=="fasta"){
186 writeFun=writeFasta
187 }else{
188 writeFun=writeFastq
189 }
190
191
192 f1out=opt$fastqX
193
194 #find wich character specify pair: - it could be
195
196 # check size of sequences - must be same...
197 n1=countLines(opt$fastqA)/4
198
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n")
200 cat("--------\n")
201
202
203 number_of_chunks=round(n1/CHUNK_SIZE)
204 if (number_of_chunks==0){
205 CHUNK_SIZE=n1
206 number_of_chunks=1
207 }
208 if (!is.null(opt$sample_size)){
209 sample_size_in_chunk=opt$sample_size/number_of_chunks
210 }else{
211 sample_size_in_chunk=CHUNK_SIZE
212 }
213
214 # adjust the chunk size to get exact count of sequences:
215 CHUNK_SIZE = ceiling(n1/number_of_chunks)
216 save.image("tmp.RData")
217
218 print("--------------------------------")
219 print (sample_size_in_chunk)
220 print (opt$sample_size)
221 print (CHUNK_SIZE)
222 print("--------------------------------")
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
224 total=0
225 nucleotideFrequenciesForward = matrix(0)
226 while(TRUE){
227 print (Sys.time())
228 fq1 <- yield(f1)
229 print (Sys.time())
230 if (length(fq1)==0){
231 break
232 }
233
234
235
236
237 if (!is.na(TRIM_END)){
238 fq1=narrow(fq1,end=ifelse((TRIM_END)>width(fq1),width(fq1),TRIM_END))
239 }
240 if (!is.na(TRIM_START)){
241 fq1=narrow(fq1,start=ifelse((TRIM_START)<width(fq1),TRIM_START,width(fq1)))
242 }
243
244
245
246 fqF1=fq1[fin_filter(fq1)]
247
248 cat("running cutadapt on chunk\n")
249 # cut addapt here: # filter parts:
250 tmp_in1=tempfile(fileext=".fastq")
251 tmp_out1=tempfile(fileext=".fastq")
252 if (is.null(formals(writeFastq)$compress)){
253 writeFastq(fqF1,file=tmp_in1)
254 }else{
255 writeFastq(fqF1,file=tmp_in1, compress = FALSE)
256 }
257
258
259 cmd1=cutadapt_cmd(tmp_in1,tmp_out1, cutadapt_arguments)
260
261 status=system(cmd1,intern=TRUE)
262 print (Sys.time())
263 # collect results
264 ftmp1=FastqFile(tmp_out1)
265
266 fqF1=readFastq(ftmp1)
267 close(ftmp1)
268
269 # remove sequences similar to filter database (e.g. plastid DNA)
270 if (!is.null(opt$filter_seq)){
271 blast_results = megablast(fqF1, database=opt$filter_seq)
272 if (!is.null(blast_results)){
273 exclude=with(blast_results, unique(qseqid[pident >= 90 & qcovs >=90]))
274 ## note - blast will truncate seq ids - anything beyond space is omitted
275 seq_to_exclude = gsub(" .*","",id(fqF1)) %in% exclude
276 excluded_contamination=signif(sum(seq_to_exclude)/length(seq_to_exclude)*100,3)
277 cat(excluded_contamination,"% filtered out after blast\n" )
278 fqF1 = fqF1[!seq_to_exclude]
279 }
280 }
281
282
283
284 # clean up
285 unlink(tmp_out1)
286 unlink(tmp_in1)
287
288
289
290
291 # filter complete pairs again:
292
293 # create new id - last character must differentiate pair - for interlacig
294 if (length(fqF1)>sample_size_in_chunk){
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk))
296 writeFun(fqF1[smp],file=f1out,mode='a')
297 nfrq1 = alphabetByCycle(sread(fqF1[smp]))
298
299 }else{
300 writeFun(fqF1,file=f1out,mode='a')
301 nfrq1 = alphabetByCycle(sread(fqF1))
302 }
303 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
304
305 }
306
307 if( !is.na(opt$png_file_output)){
308 png(opt$png_file_output, width = 1000, height = 500)
309 plotFrequencies(nucleotideFrequenciesForward)
310 dev.off()
311 }
312
313 close(f1)
314
315
316
317