annotate single_fastq_filtering.R @ 21:f4ed6a65a2ff draft

Uploaded
author petr-novak
date Thu, 27 Jul 2023 09:46:13 +0000
parents 378565f5a875
children f1738f8649b0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env Rscript
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
2
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
3
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
4
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
5 library(optparse,quiet=TRUE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
6 matricesSum = function(m1,m2){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
7 mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
8 rownames(mfin) = rownames(m2)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
9 if (sum(m1)>0){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
10 mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
11 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
12 if (sum(m2)>0){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
13 mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
14 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
15
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
16 return(mfin)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
17 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
18
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
19 plotFrequencies = function(x){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
20 par(xaxs = 'i', yaxs = 'i')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
21 plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
22 col = c(A="red", "T"="orange", C="blue", G="green")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
23 for (n in c("A","C", "T","G")){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
24 points(x[n,]/colSums(x), type = 'l', col = col[n])
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
25 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
26 grid(ny = 50, nx = ncol(x) - 1 )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
27 legend("topright", col = col, lwd = 2, legend = names(col))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
28 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
29
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
30 megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
31 # assume blastn and makeblastdb in $PATH
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
32 tmp_in = tempfile()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
33 tmp_out = tempfile()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
34 writeFasta(fx,file=tmp_in)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
35 ## check if db is present:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
36 dbfiles = paste0(database, ".nhr")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
37 if (!file.exists(dbfiles)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
38 cat("running makeblastdb\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
39 system(paste0("makeblastdb -dbtype nucl -in ", database))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
40 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
41 params = '-num_alignments 1 -num_threads 4 -outfmt "6 qseqid qcovs pident" '
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
42 cmd=paste("blastn", " -db", database, "-query", tmp_in,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
43 "-out", tmp_out, " ", params)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
44 status=system(cmd,intern=TRUE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
45
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
46 if (file.info(tmp_out)$size != 0){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
47 results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
48 colnames(results) = c("qseqid", "qcovs", "pident")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
49 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
50 results=NULL
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
51 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
52 unlink(tmp_in)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
53 unlink(tmp_out)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
54 return(results)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
55 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
56
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
57
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
58
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
59 option_list = list(
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
60 make_option(c('-a', '--fastqA'),action='store',type='character',help='fastq file A',default=NA),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
61 make_option(c('-x', '--fastqX'),action='store',type='character',help='output fastq file X',default=NA),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
62 make_option(c('-c', '--cut_off'),action='store',type='numeric',help='Quality cut-off value [default %default]',default=10),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
63 make_option(c('-r', '--rnd_seed'),action='store',type='numeric',help='seed for random number generator [default %default]',default=123),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
64 make_option(c('-p', '--percent_above'),action='store',type='numeric',
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
65 help='Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]',default=95),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
66 make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
67 make_option(c('-e', '--trim_end'), action='store',type='numeric',help="trimming - end position [no trimming by default]", default=NA),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
68 make_option(c('-s', '--trim_start'), action='store',type='numeric',help="triming position - start [no trimming by default]", default=NA),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
69
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
70 make_option(c('-n', '--sample_size'),action='store',type='numeric',help='requested sample size (number of pairs)[no sampling by default]',default=NULL),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
71 make_option(c('-N', '--Max_N'),action='store',type='integer',help='maximum number of Ns in sequences [default %default]',default=0),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
72 make_option(c('-f', '--format'),action='store',type='character',help='format of output - fastq or fasta [default %default] ',default="fasta"),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
73 make_option(c('-C', '--cutadapt_options'),action='store',type='character',help='file specifying cutadapt options',default=NULL),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
74 make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
75 make_option(c('-j', '--chunk_size'),action='store',type='numeric',help='Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]',default=1000000)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
76
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
77
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
78
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
79
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
80
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
81 )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
82
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
83 description=paste(strwrap('Script for filterinq fastq file f
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
84 '),collapse="\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
85 # arguments whic hcould be modifed
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
86 cutadapt_arguments=paste(
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
87 " --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
88 " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
89 " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
90 " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
91 "--error-rate=0.05 --times=1 --overlap=15 --discard ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
92 " ",sep='')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
93
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
94
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
95 epilogue=paste(strwrap(c('Description:\n
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
96 This tool is designed to make memory efficient preprocessing of single fastq files. Input files can be in GNU zipped archive (.gz extension).
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
97 Reads are filtered based on the quality, presence of N bases and adapters.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
98 Cutaddapt program must be installed to use adapter filtering.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
99 The input files are process in chunks,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
100 All reads which pass the quality filter fill be writen into output file.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
101 If --sample size is specified, only sample of sequences will be returned.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
102 By default cutadapt us run with this options:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
103 \n ',
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
104 cutadapt_arguments,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
105 "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
106 Consult cutadapt manual for usage. ")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
107 ),collapse="\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
108
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
109 epilogue=paste(epilogue,"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
110
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
111
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
112 # example:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
113 TODO
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
114
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
115 ",sep='\n')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
116
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
117
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
118 parser=OptionParser(
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
119 option_list=option_list,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
120 epilogue=epilogue,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
121 description=description,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
122 )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
123
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
124
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
125
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
126 opt = parse_args(parser, args=commandArgs(TRUE))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
127 if (!(opt$format %in% c("fasta","fastq"))){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
128 stop("wrong output format")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
129 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
130
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
131 if (any(is.na(c(opt$fastqA, opt$fastqX )))){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
132 cat("\nInput ond output files must be specified!\n\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
133 print_help(parser)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
134 q()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
135 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
136
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
137
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
138 if (!is.null(opt$cutadapt_options)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
139 cutadapt_argumenns=scan(opt$cutadapt_options,what="character", comment.char="#")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
140 #remove "EOF"
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
141 cutadapt_arguments=paste(" ",cutadapt_arguments," ",collapse=' ',sep=" ")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
142 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
143
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
144
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
145 cutadapt_cmd=function(input,output,cutadapt_arguments){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
146 cmds=paste("cutadapt --format=",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
147 "fastq",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
148 cutadapt_arguments,
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
149 " ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
150 c(input),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
151 " -o ",
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
152 c(output),
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
153 sep="")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
154
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
155 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
156
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
157 set.seed(opt$rnd_seed)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
158
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
159 PROP=opt$percent_above/100
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
160 MINQ=opt$cut_off
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
161 TRIM_END=opt$trim_end
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
162 TRIM_START=opt$trim_start
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
163
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
164 CHUNK_SIZE=opt$chunk_size
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
165
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
166 MIN_LENGTH=TRIM_END - ifelse(is.na(TRIM_START),1,TRIM_START)+1
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
167 if (is.na(MIN_LENGTH)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
168 MIN_LENGTH=1
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
169 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
170
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
171 suppressPackageStartupMessages(library(ShortRead))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
172 Qfilter=srFilter(function(x){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
173 rowSums(as(quality(x),'matrix')>MINQ,na.rm=TRUE)/nchar(sread(x))>=PROP
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
174 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
175 )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
176
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
177 Min_Length_filter=srFilter(function(x){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
178 nchar(sread(x))>=MIN_LENGTH
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
179 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
180 )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
181
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
182
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
183 fin_filter=compose(Qfilter,nFilter(opt$Max_N),Min_Length_filter)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
184
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
185 if (opt$format=="fasta"){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
186 writeFun=writeFasta
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
187 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
188 writeFun=writeFastq
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
189 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
190
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
191
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
192 f1out=opt$fastqX
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
193
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
194 #find wich character specify pair: - it could be
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
195
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
196 # check size of sequences - must be same...
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
197 n1=countLines(opt$fastqA)/4
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
198
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
200 cat("--------\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
201
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
202
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
203 number_of_chunks=round(n1/CHUNK_SIZE)
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
204 CHUNK_SIZE = round(n1/number_of_chunks)
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
205
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
206 if (number_of_chunks==0){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
207 CHUNK_SIZE=n1
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
208 number_of_chunks=1
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
209 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
210 if (!is.null(opt$sample_size)){
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
211 sample_size_in_chunk=round(opt$sample_size/number_of_chunks)
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
212 n_missing = opt$sample_size - sample_size_in_chunk * number_of_chunks
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
213 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
214 sample_size_in_chunk=CHUNK_SIZE
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
215 n_missing = 0
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
216 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
217
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
218 # adjust the chunk size to get exact count of sequences:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
219 CHUNK_SIZE = ceiling(n1/number_of_chunks)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
220 save.image("tmp.RData")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
221
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
222
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
224 total=0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
225 nucleotideFrequenciesForward = matrix(0)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
226 while(TRUE){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
227 print (Sys.time())
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
228 fq1 <- yield(f1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
229 print (Sys.time())
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
230 if (length(fq1)==0){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
231 break
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
232 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
233
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
234
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
235
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
236
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
237 if (!is.na(TRIM_END)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
238 fq1=narrow(fq1,end=ifelse((TRIM_END)>width(fq1),width(fq1),TRIM_END))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
239 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
240 if (!is.na(TRIM_START)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
241 fq1=narrow(fq1,start=ifelse((TRIM_START)<width(fq1),TRIM_START,width(fq1)))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
242 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
243
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
244
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
245
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
246 fqF1=fq1[fin_filter(fq1)]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
247
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
248 cat("running cutadapt on chunk\n")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
249 # cut addapt here: # filter parts:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
250 tmp_in1=tempfile(fileext=".fastq")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
251 tmp_out1=tempfile(fileext=".fastq")
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
252 if (is.null(formals(writeFastq)$compress)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
253 writeFastq(fqF1,file=tmp_in1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
254 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
255 writeFastq(fqF1,file=tmp_in1, compress = FALSE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
256 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
257
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
258
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
259 cmd1=cutadapt_cmd(tmp_in1,tmp_out1, cutadapt_arguments)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
260
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
261 status=system(cmd1,intern=TRUE)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
262 print (Sys.time())
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
263 # collect results
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
264 ftmp1=FastqFile(tmp_out1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
265
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
266 fqF1=readFastq(ftmp1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
267 close(ftmp1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
268
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
269 # remove sequences similar to filter database (e.g. plastid DNA)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
270 if (!is.null(opt$filter_seq)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
271 blast_results = megablast(fqF1, database=opt$filter_seq)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
272 if (!is.null(blast_results)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
273 exclude=with(blast_results, unique(qseqid[pident >= 90 & qcovs >=90]))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
274 ## note - blast will truncate seq ids - anything beyond space is omitted
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
275 seq_to_exclude = gsub(" .*","",id(fqF1)) %in% exclude
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
276 excluded_contamination=signif(sum(seq_to_exclude)/length(seq_to_exclude)*100,3)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
277 cat(excluded_contamination,"% filtered out after blast\n" )
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
278 fqF1 = fqF1[!seq_to_exclude]
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
279 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
280 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
281
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
282
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
283
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
284 # clean up
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
285 unlink(tmp_out1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
286 unlink(tmp_in1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
287
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
288
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
289
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
290
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
291 # filter complete pairs again:
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
292
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
293 # create new id - last character must differentiate pair - for interlacig
5
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
294 if (length(fqF1)>(sample_size_in_chunk + n_missing)){
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk + n_missing))
378565f5a875 Uploaded
petr-novak
parents: 0
diff changeset
296 n_missing = 0
0
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
297 writeFun(fqF1[smp],file=f1out,mode='a')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
298 nfrq1 = alphabetByCycle(sread(fqF1[smp]))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
299
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
300 }else{
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
301 writeFun(fqF1,file=f1out,mode='a')
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
302 nfrq1 = alphabetByCycle(sread(fqF1))
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
303 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
304 nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
305
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
306 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
307
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
308 if( !is.na(opt$png_file_output)){
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
309 png(opt$png_file_output, width = 1000, height = 500)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
310 plotFrequencies(nucleotideFrequenciesForward)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
311 dev.off()
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
312 }
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
313
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
314 close(f1)
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
315
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
316
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
317
a4cd8608ef6b Uploaded
petr-novak
parents:
diff changeset
318