Mercurial > repos > petr-novak > re_utils
comparison single_fastq_filtering.R @ 0:a4cd8608ef6b draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 01 Apr 2019 07:56:36 -0400 |
parents | |
children | 378565f5a875 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a4cd8608ef6b |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 | |
4 | |
5 library(optparse,quiet=TRUE) | |
6 matricesSum = function(m1,m2){ | |
7 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2))) | |
8 rownames(mfin) = rownames(m2) | |
9 if (sum(m1)>0){ | |
10 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1 | |
11 } | |
12 if (sum(m2)>0){ | |
13 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2 | |
14 } | |
15 | |
16 return(mfin) | |
17 } | |
18 | |
19 plotFrequencies = function(x){ | |
20 par(xaxs = 'i', yaxs = 'i') | |
21 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion") | |
22 col = c(A="red", "T"="orange", C="blue", G="green") | |
23 for (n in c("A","C", "T","G")){ | |
24 points(x[n,]/colSums(x), type = 'l', col = col[n]) | |
25 } | |
26 grid(ny = 50, nx = ncol(x) - 1 ) | |
27 legend("topright", col = col, lwd = 2, legend = names(col)) | |
28 } | |
29 | |
30 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){ | |
31 # assume blastn and makeblastdb in $PATH | |
32 tmp_in = tempfile() | |
33 tmp_out = tempfile() | |
34 writeFasta(fx,file=tmp_in) | |
35 ## check if db is present: | |
36 dbfiles = paste0(database, ".nhr") | |
37 if (!file.exists(dbfiles)){ | |
38 cat("running makeblastdb\n") | |
39 system(paste0("makeblastdb -dbtype nucl -in ", database)) | |
40 } | |
41 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" ' | |
42 cmd=paste("blastn", " -db", database, "-query", tmp_in, | |
43 "-out", tmp_out, " ", params) | |
44 status=system(cmd,intern=TRUE) | |
45 | |
46 if (file.info(tmp_out)$size != 0){ | |
47 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="") | |
48 colnames(results) = c("qseqid", "qcovs", "pident") | |
49 }else{ | |
50 results=NULL | |
51 } | |
52 unlink(tmp_in) | |
53 unlink(tmp_out) | |
54 return(results) | |
55 } | |
56 | |
57 | |
58 | |
59 option_list = list( | |
60 make_option(c('-a', '--fastqA'),action='store',type='character',help='fastq file A',default=NA), | |
61 make_option(c('-x', '--fastqX'),action='store',type='character',help='output fastq file X',default=NA), | |
62 make_option(c('-c', '--cut_off'),action='store',type='numeric',help='Quality cut-off value [default %default]',default=10), | |
63 make_option(c('-r', '--rnd_seed'),action='store',type='numeric',help='seed for random number generator [default %default]',default=123), | |
64 make_option(c('-p', '--percent_above'),action='store',type='numeric', | |
65 help='Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]',default=95), | |
66 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA), | |
67 make_option(c('-e', '--trim_end'), action='store',type='numeric',help="trimming - end position [no trimming by default]", default=NA), | |
68 make_option(c('-s', '--trim_start'), action='store',type='numeric',help="triming position - start [no trimming by default]", default=NA), | |
69 | |
70 make_option(c('-n', '--sample_size'),action='store',type='numeric',help='requested sample size (number of pairs)[no sampling by default]',default=NULL), | |
71 make_option(c('-N', '--Max_N'),action='store',type='integer',help='maximum number of Ns in sequences [default %default]',default=0), | |
72 make_option(c('-f', '--format'),action='store',type='character',help='format of output - fastq or fasta [default %default] ',default="fasta"), | |
73 make_option(c('-C', '--cutadapt_options'),action='store',type='character',help='file specifying cutadapt options',default=NULL), | |
74 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL), | |
75 make_option(c('-j', '--chunk_size'),action='store',type='numeric',help='Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]',default=1000000) | |
76 | |
77 | |
78 | |
79 | |
80 | |
81 ) | |
82 | |
83 description=paste(strwrap('Script for filterinq fastq file f | |
84 '),collapse="\n") | |
85 # arguments whic hcould be modifed | |
86 cutadapt_arguments=paste( | |
87 " --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ", | |
88 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ", | |
89 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ", | |
90 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ", | |
91 "--error-rate=0.05 --times=1 --overlap=15 --discard ", | |
92 " ",sep='') | |
93 | |
94 | |
95 epilogue=paste(strwrap(c('Description:\n | |
96 This tool is designed to make memory efficient preprocessing of single fastq files. Input files can be in GNU zipped archive (.gz extension). | |
97 Reads are filtered based on the quality, presence of N bases and adapters. | |
98 Cutaddapt program must be installed to use adapter filtering. | |
99 The input files are process in chunks, | |
100 All reads which pass the quality filter fill be writen into output file. | |
101 If --sample size is specified, only sample of sequences will be returned. | |
102 By default cutadapt us run with this options: | |
103 \n ', | |
104 cutadapt_arguments, | |
105 "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options. | |
106 Consult cutadapt manual for usage. ") | |
107 ),collapse="\n") | |
108 | |
109 epilogue=paste(epilogue," | |
110 | |
111 | |
112 # example: | |
113 TODO | |
114 | |
115 ",sep='\n') | |
116 | |
117 | |
118 parser=OptionParser( | |
119 option_list=option_list, | |
120 epilogue=epilogue, | |
121 description=description, | |
122 ) | |
123 | |
124 | |
125 | |
126 opt = parse_args(parser, args=commandArgs(TRUE)) | |
127 if (!(opt$format %in% c("fasta","fastq"))){ | |
128 stop("wrong output format") | |
129 } | |
130 | |
131 if (any(is.na(c(opt$fastqA, opt$fastqX )))){ | |
132 cat("\nInput ond output files must be specified!\n\n") | |
133 print_help(parser) | |
134 q() | |
135 } | |
136 | |
137 | |
138 if (!is.null(opt$cutadapt_options)){ | |
139 cutadapt_argumenns=scan(opt$cutadapt_options,what="character", comment.char="#") | |
140 #remove "EOF" | |
141 cutadapt_arguments=paste(" ",cutadapt_arguments," ",collapse=' ',sep=" ") | |
142 } | |
143 | |
144 | |
145 cutadapt_cmd=function(input,output,cutadapt_arguments){ | |
146 cmds=paste("cutadapt --format=", | |
147 "fastq", | |
148 cutadapt_arguments, | |
149 " ", | |
150 c(input), | |
151 " -o ", | |
152 c(output), | |
153 sep="") | |
154 | |
155 } | |
156 | |
157 set.seed(opt$rnd_seed) | |
158 | |
159 PROP=opt$percent_above/100 | |
160 MINQ=opt$cut_off | |
161 TRIM_END=opt$trim_end | |
162 TRIM_START=opt$trim_start | |
163 | |
164 CHUNK_SIZE=opt$chunk_size | |
165 | |
166 MIN_LENGTH=TRIM_END - ifelse(is.na(TRIM_START),1,TRIM_START)+1 | |
167 if (is.na(MIN_LENGTH)){ | |
168 MIN_LENGTH=1 | |
169 } | |
170 | |
171 suppressPackageStartupMessages(library(ShortRead)) | |
172 Qfilter=srFilter(function(x){ | |
173 rowSums(as(quality(x),'matrix')>MINQ,na.rm=TRUE)/nchar(sread(x))>=PROP | |
174 } | |
175 ) | |
176 | |
177 Min_Length_filter=srFilter(function(x){ | |
178 nchar(sread(x))>=MIN_LENGTH | |
179 } | |
180 ) | |
181 | |
182 | |
183 fin_filter=compose(Qfilter,nFilter(opt$Max_N),Min_Length_filter) | |
184 | |
185 if (opt$format=="fasta"){ | |
186 writeFun=writeFasta | |
187 }else{ | |
188 writeFun=writeFastq | |
189 } | |
190 | |
191 | |
192 f1out=opt$fastqX | |
193 | |
194 #find wich character specify pair: - it could be | |
195 | |
196 # check size of sequences - must be same... | |
197 n1=countLines(opt$fastqA)/4 | |
198 | |
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n") | |
200 cat("--------\n") | |
201 | |
202 | |
203 number_of_chunks=round(n1/CHUNK_SIZE) | |
204 if (number_of_chunks==0){ | |
205 CHUNK_SIZE=n1 | |
206 number_of_chunks=1 | |
207 } | |
208 if (!is.null(opt$sample_size)){ | |
209 sample_size_in_chunk=opt$sample_size/number_of_chunks | |
210 }else{ | |
211 sample_size_in_chunk=CHUNK_SIZE | |
212 } | |
213 | |
214 # adjust the chunk size to get exact count of sequences: | |
215 CHUNK_SIZE = ceiling(n1/number_of_chunks) | |
216 save.image("tmp.RData") | |
217 | |
218 print("--------------------------------") | |
219 print (sample_size_in_chunk) | |
220 print (opt$sample_size) | |
221 print (CHUNK_SIZE) | |
222 print("--------------------------------") | |
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE) | |
224 total=0 | |
225 nucleotideFrequenciesForward = matrix(0) | |
226 while(TRUE){ | |
227 print (Sys.time()) | |
228 fq1 <- yield(f1) | |
229 print (Sys.time()) | |
230 if (length(fq1)==0){ | |
231 break | |
232 } | |
233 | |
234 | |
235 | |
236 | |
237 if (!is.na(TRIM_END)){ | |
238 fq1=narrow(fq1,end=ifelse((TRIM_END)>width(fq1),width(fq1),TRIM_END)) | |
239 } | |
240 if (!is.na(TRIM_START)){ | |
241 fq1=narrow(fq1,start=ifelse((TRIM_START)<width(fq1),TRIM_START,width(fq1))) | |
242 } | |
243 | |
244 | |
245 | |
246 fqF1=fq1[fin_filter(fq1)] | |
247 | |
248 cat("running cutadapt on chunk\n") | |
249 # cut addapt here: # filter parts: | |
250 tmp_in1=tempfile(fileext=".fastq") | |
251 tmp_out1=tempfile(fileext=".fastq") | |
252 if (is.null(formals(writeFastq)$compress)){ | |
253 writeFastq(fqF1,file=tmp_in1) | |
254 }else{ | |
255 writeFastq(fqF1,file=tmp_in1, compress = FALSE) | |
256 } | |
257 | |
258 | |
259 cmd1=cutadapt_cmd(tmp_in1,tmp_out1, cutadapt_arguments) | |
260 | |
261 status=system(cmd1,intern=TRUE) | |
262 print (Sys.time()) | |
263 # collect results | |
264 ftmp1=FastqFile(tmp_out1) | |
265 | |
266 fqF1=readFastq(ftmp1) | |
267 close(ftmp1) | |
268 | |
269 # remove sequences similar to filter database (e.g. plastid DNA) | |
270 if (!is.null(opt$filter_seq)){ | |
271 blast_results = megablast(fqF1, database=opt$filter_seq) | |
272 if (!is.null(blast_results)){ | |
273 exclude=with(blast_results, unique(qseqid[pident >= 90 & qcovs >=90])) | |
274 ## note - blast will truncate seq ids - anything beyond space is omitted | |
275 seq_to_exclude = gsub(" .*","",id(fqF1)) %in% exclude | |
276 excluded_contamination=signif(sum(seq_to_exclude)/length(seq_to_exclude)*100,3) | |
277 cat(excluded_contamination,"% filtered out after blast\n" ) | |
278 fqF1 = fqF1[!seq_to_exclude] | |
279 } | |
280 } | |
281 | |
282 | |
283 | |
284 # clean up | |
285 unlink(tmp_out1) | |
286 unlink(tmp_in1) | |
287 | |
288 | |
289 | |
290 | |
291 # filter complete pairs again: | |
292 | |
293 # create new id - last character must differentiate pair - for interlacig | |
294 if (length(fqF1)>sample_size_in_chunk){ | |
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk)) | |
296 writeFun(fqF1[smp],file=f1out,mode='a') | |
297 nfrq1 = alphabetByCycle(sread(fqF1[smp])) | |
298 | |
299 }else{ | |
300 writeFun(fqF1,file=f1out,mode='a') | |
301 nfrq1 = alphabetByCycle(sread(fqF1)) | |
302 } | |
303 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1) | |
304 | |
305 } | |
306 | |
307 if( !is.na(opt$png_file_output)){ | |
308 png(opt$png_file_output, width = 1000, height = 500) | |
309 plotFrequencies(nucleotideFrequenciesForward) | |
310 dev.off() | |
311 } | |
312 | |
313 close(f1) | |
314 | |
315 | |
316 | |
317 |