0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3
|
|
4
|
|
5 library(optparse,quiet=TRUE)
|
|
6 matricesSum = function(m1,m2){
|
|
7 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
|
|
8 rownames(mfin) = rownames(m2)
|
|
9 if (sum(m1)>0){
|
|
10 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
|
|
11 }
|
|
12 if (sum(m2)>0){
|
|
13 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
|
|
14 }
|
|
15
|
|
16 return(mfin)
|
|
17 }
|
|
18
|
|
19 plotFrequencies = function(x){
|
|
20 par(xaxs = 'i', yaxs = 'i')
|
|
21 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
|
|
22 col = c(A="red", "T"="orange", C="blue", G="green")
|
|
23 for (n in c("A","C", "T","G")){
|
|
24 points(x[n,]/colSums(x), type = 'l', col = col[n])
|
|
25 }
|
|
26 grid(ny = 50, nx = ncol(x) - 1 )
|
|
27 legend("topright", col = col, lwd = 2, legend = names(col))
|
|
28 }
|
|
29
|
|
30 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
|
|
31 # assume blastn and makeblastdb in $PATH
|
|
32 tmp_in = tempfile()
|
|
33 tmp_out = tempfile()
|
|
34 writeFasta(fx,file=tmp_in)
|
|
35 ## check if db is present:
|
|
36 dbfiles = paste0(database, ".nhr")
|
|
37 if (!file.exists(dbfiles)){
|
|
38 cat("running makeblastdb\n")
|
|
39 system(paste0("makeblastdb -dbtype nucl -in ", database))
|
|
40 }
|
|
41 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" '
|
|
42 cmd=paste("blastn", " -db", database, "-query", tmp_in,
|
|
43 "-out", tmp_out, " ", params)
|
|
44 status=system(cmd,intern=TRUE)
|
|
45
|
|
46 if (file.info(tmp_out)$size != 0){
|
|
47 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
|
|
48 colnames(results) = c("qseqid", "qcovs", "pident")
|
|
49 }else{
|
|
50 results=NULL
|
|
51 }
|
|
52 unlink(tmp_in)
|
|
53 unlink(tmp_out)
|
|
54 return(results)
|
|
55 }
|
|
56
|
|
57
|
|
58
|
|
59 option_list = list(
|
|
60 make_option(c('-a', '--fastqA'),action='store',type='character',help='fastq file A',default=NA),
|
|
61 make_option(c('-x', '--fastqX'),action='store',type='character',help='output fastq file X',default=NA),
|
|
62 make_option(c('-c', '--cut_off'),action='store',type='numeric',help='Quality cut-off value [default %default]',default=10),
|
|
63 make_option(c('-r', '--rnd_seed'),action='store',type='numeric',help='seed for random number generator [default %default]',default=123),
|
|
64 make_option(c('-p', '--percent_above'),action='store',type='numeric',
|
|
65 help='Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]',default=95),
|
|
66 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
|
|
67 make_option(c('-e', '--trim_end'), action='store',type='numeric',help="trimming - end position [no trimming by default]", default=NA),
|
|
68 make_option(c('-s', '--trim_start'), action='store',type='numeric',help="triming position - start [no trimming by default]", default=NA),
|
|
69
|
|
70 make_option(c('-n', '--sample_size'),action='store',type='numeric',help='requested sample size (number of pairs)[no sampling by default]',default=NULL),
|
|
71 make_option(c('-N', '--Max_N'),action='store',type='integer',help='maximum number of Ns in sequences [default %default]',default=0),
|
|
72 make_option(c('-f', '--format'),action='store',type='character',help='format of output - fastq or fasta [default %default] ',default="fasta"),
|
|
73 make_option(c('-C', '--cutadapt_options'),action='store',type='character',help='file specifying cutadapt options',default=NULL),
|
|
74 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL),
|
|
75 make_option(c('-j', '--chunk_size'),action='store',type='numeric',help='Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]',default=1000000)
|
|
76
|
|
77
|
|
78
|
|
79
|
|
80
|
|
81 )
|
|
82
|
|
83 description=paste(strwrap('Script for filterinq fastq file f
|
|
84 '),collapse="\n")
|
|
85 # arguments whic hcould be modifed
|
|
86 cutadapt_arguments=paste(
|
|
87 " --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
|
|
88 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
|
|
89 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
|
|
90 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
|
|
91 "--error-rate=0.05 --times=1 --overlap=15 --discard ",
|
|
92 " ",sep='')
|
|
93
|
|
94
|
|
95 epilogue=paste(strwrap(c('Description:\n
|
|
96 This tool is designed to make memory efficient preprocessing of single fastq files. Input files can be in GNU zipped archive (.gz extension).
|
|
97 Reads are filtered based on the quality, presence of N bases and adapters.
|
|
98 Cutaddapt program must be installed to use adapter filtering.
|
|
99 The input files are process in chunks,
|
|
100 All reads which pass the quality filter fill be writen into output file.
|
|
101 If --sample size is specified, only sample of sequences will be returned.
|
|
102 By default cutadapt us run with this options:
|
|
103 \n ',
|
|
104 cutadapt_arguments,
|
|
105 "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
|
|
106 Consult cutadapt manual for usage. ")
|
|
107 ),collapse="\n")
|
|
108
|
|
109 epilogue=paste(epilogue,"
|
|
110
|
|
111
|
|
112 # example:
|
|
113 TODO
|
|
114
|
|
115 ",sep='\n')
|
|
116
|
|
117
|
|
118 parser=OptionParser(
|
|
119 option_list=option_list,
|
|
120 epilogue=epilogue,
|
|
121 description=description,
|
|
122 )
|
|
123
|
|
124
|
|
125
|
|
126 opt = parse_args(parser, args=commandArgs(TRUE))
|
|
127 if (!(opt$format %in% c("fasta","fastq"))){
|
|
128 stop("wrong output format")
|
|
129 }
|
|
130
|
|
131 if (any(is.na(c(opt$fastqA, opt$fastqX )))){
|
|
132 cat("\nInput ond output files must be specified!\n\n")
|
|
133 print_help(parser)
|
|
134 q()
|
|
135 }
|
|
136
|
|
137
|
|
138 if (!is.null(opt$cutadapt_options)){
|
|
139 cutadapt_argumenns=scan(opt$cutadapt_options,what="character", comment.char="#")
|
|
140 #remove "EOF"
|
|
141 cutadapt_arguments=paste(" ",cutadapt_arguments," ",collapse=' ',sep=" ")
|
|
142 }
|
|
143
|
|
144
|
|
145 cutadapt_cmd=function(input,output,cutadapt_arguments){
|
|
146 cmds=paste("cutadapt --format=",
|
|
147 "fastq",
|
|
148 cutadapt_arguments,
|
|
149 " ",
|
|
150 c(input),
|
|
151 " -o ",
|
|
152 c(output),
|
|
153 sep="")
|
|
154
|
|
155 }
|
|
156
|
|
157 set.seed(opt$rnd_seed)
|
|
158
|
|
159 PROP=opt$percent_above/100
|
|
160 MINQ=opt$cut_off
|
|
161 TRIM_END=opt$trim_end
|
|
162 TRIM_START=opt$trim_start
|
|
163
|
|
164 CHUNK_SIZE=opt$chunk_size
|
|
165
|
|
166 MIN_LENGTH=TRIM_END - ifelse(is.na(TRIM_START),1,TRIM_START)+1
|
|
167 if (is.na(MIN_LENGTH)){
|
|
168 MIN_LENGTH=1
|
|
169 }
|
|
170
|
|
171 suppressPackageStartupMessages(library(ShortRead))
|
|
172 Qfilter=srFilter(function(x){
|
|
173 rowSums(as(quality(x),'matrix')>MINQ,na.rm=TRUE)/nchar(sread(x))>=PROP
|
|
174 }
|
|
175 )
|
|
176
|
|
177 Min_Length_filter=srFilter(function(x){
|
|
178 nchar(sread(x))>=MIN_LENGTH
|
|
179 }
|
|
180 )
|
|
181
|
|
182
|
|
183 fin_filter=compose(Qfilter,nFilter(opt$Max_N),Min_Length_filter)
|
|
184
|
|
185 if (opt$format=="fasta"){
|
|
186 writeFun=writeFasta
|
|
187 }else{
|
|
188 writeFun=writeFastq
|
|
189 }
|
|
190
|
|
191
|
|
192 f1out=opt$fastqX
|
|
193
|
|
194 #find wich character specify pair: - it could be
|
|
195
|
|
196 # check size of sequences - must be same...
|
|
197 n1=countLines(opt$fastqA)/4
|
|
198
|
|
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n")
|
|
200 cat("--------\n")
|
|
201
|
|
202
|
|
203 number_of_chunks=round(n1/CHUNK_SIZE)
|
5
|
204 CHUNK_SIZE = round(n1/number_of_chunks)
|
|
205
|
0
|
206 if (number_of_chunks==0){
|
|
207 CHUNK_SIZE=n1
|
|
208 number_of_chunks=1
|
|
209 }
|
|
210 if (!is.null(opt$sample_size)){
|
5
|
211 sample_size_in_chunk=round(opt$sample_size/number_of_chunks)
|
|
212 n_missing = opt$sample_size - sample_size_in_chunk * number_of_chunks
|
0
|
213 }else{
|
|
214 sample_size_in_chunk=CHUNK_SIZE
|
5
|
215 n_missing = 0
|
0
|
216 }
|
|
217
|
|
218 # adjust the chunk size to get exact count of sequences:
|
|
219 CHUNK_SIZE = ceiling(n1/number_of_chunks)
|
|
220 save.image("tmp.RData")
|
|
221
|
5
|
222
|
0
|
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
|
|
224 total=0
|
|
225 nucleotideFrequenciesForward = matrix(0)
|
|
226 while(TRUE){
|
|
227 print (Sys.time())
|
|
228 fq1 <- yield(f1)
|
|
229 print (Sys.time())
|
|
230 if (length(fq1)==0){
|
|
231 break
|
|
232 }
|
|
233
|
|
234
|
|
235
|
|
236
|
|
237 if (!is.na(TRIM_END)){
|
|
238 fq1=narrow(fq1,end=ifelse((TRIM_END)>width(fq1),width(fq1),TRIM_END))
|
|
239 }
|
|
240 if (!is.na(TRIM_START)){
|
|
241 fq1=narrow(fq1,start=ifelse((TRIM_START)<width(fq1),TRIM_START,width(fq1)))
|
|
242 }
|
|
243
|
|
244
|
|
245
|
|
246 fqF1=fq1[fin_filter(fq1)]
|
|
247
|
|
248 cat("running cutadapt on chunk\n")
|
|
249 # cut addapt here: # filter parts:
|
|
250 tmp_in1=tempfile(fileext=".fastq")
|
|
251 tmp_out1=tempfile(fileext=".fastq")
|
|
252 if (is.null(formals(writeFastq)$compress)){
|
|
253 writeFastq(fqF1,file=tmp_in1)
|
|
254 }else{
|
|
255 writeFastq(fqF1,file=tmp_in1, compress = FALSE)
|
|
256 }
|
|
257
|
|
258
|
|
259 cmd1=cutadapt_cmd(tmp_in1,tmp_out1, cutadapt_arguments)
|
|
260
|
|
261 status=system(cmd1,intern=TRUE)
|
|
262 print (Sys.time())
|
|
263 # collect results
|
|
264 ftmp1=FastqFile(tmp_out1)
|
|
265
|
|
266 fqF1=readFastq(ftmp1)
|
|
267 close(ftmp1)
|
|
268
|
|
269 # remove sequences similar to filter database (e.g. plastid DNA)
|
|
270 if (!is.null(opt$filter_seq)){
|
|
271 blast_results = megablast(fqF1, database=opt$filter_seq)
|
|
272 if (!is.null(blast_results)){
|
|
273 exclude=with(blast_results, unique(qseqid[pident >= 90 & qcovs >=90]))
|
|
274 ## note - blast will truncate seq ids - anything beyond space is omitted
|
|
275 seq_to_exclude = gsub(" .*","",id(fqF1)) %in% exclude
|
|
276 excluded_contamination=signif(sum(seq_to_exclude)/length(seq_to_exclude)*100,3)
|
|
277 cat(excluded_contamination,"% filtered out after blast\n" )
|
|
278 fqF1 = fqF1[!seq_to_exclude]
|
|
279 }
|
|
280 }
|
|
281
|
|
282
|
|
283
|
|
284 # clean up
|
|
285 unlink(tmp_out1)
|
|
286 unlink(tmp_in1)
|
|
287
|
|
288
|
|
289
|
|
290
|
|
291 # filter complete pairs again:
|
|
292
|
|
293 # create new id - last character must differentiate pair - for interlacig
|
5
|
294 if (length(fqF1)>(sample_size_in_chunk + n_missing)){
|
|
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk + n_missing))
|
|
296 n_missing = 0
|
0
|
297 writeFun(fqF1[smp],file=f1out,mode='a')
|
|
298 nfrq1 = alphabetByCycle(sread(fqF1[smp]))
|
|
299
|
|
300 }else{
|
|
301 writeFun(fqF1,file=f1out,mode='a')
|
|
302 nfrq1 = alphabetByCycle(sread(fqF1))
|
|
303 }
|
|
304 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
|
|
305
|
|
306 }
|
|
307
|
|
308 if( !is.na(opt$png_file_output)){
|
|
309 png(opt$png_file_output, width = 1000, height = 500)
|
|
310 plotFrequencies(nucleotideFrequenciesForward)
|
|
311 dev.off()
|
|
312 }
|
|
313
|
|
314 close(f1)
|
|
315
|
|
316
|
|
317
|
|
318
|