comparison single_fastq_filtering.R @ 5:378565f5a875 draft

Uploaded
author petr-novak
date Fri, 22 Nov 2019 07:56:48 -0500
parents a4cd8608ef6b
children f1738f8649b0
comparison
equal deleted inserted replaced
4:d397f5a85464 5:378565f5a875
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n") 199 cat("number of sequences in ",opt$fastqA,":",n1,"\n")
200 cat("--------\n") 200 cat("--------\n")
201 201
202 202
203 number_of_chunks=round(n1/CHUNK_SIZE) 203 number_of_chunks=round(n1/CHUNK_SIZE)
204 CHUNK_SIZE = round(n1/number_of_chunks)
205
204 if (number_of_chunks==0){ 206 if (number_of_chunks==0){
205 CHUNK_SIZE=n1 207 CHUNK_SIZE=n1
206 number_of_chunks=1 208 number_of_chunks=1
207 } 209 }
208 if (!is.null(opt$sample_size)){ 210 if (!is.null(opt$sample_size)){
209 sample_size_in_chunk=opt$sample_size/number_of_chunks 211 sample_size_in_chunk=round(opt$sample_size/number_of_chunks)
212 n_missing = opt$sample_size - sample_size_in_chunk * number_of_chunks
210 }else{ 213 }else{
211 sample_size_in_chunk=CHUNK_SIZE 214 sample_size_in_chunk=CHUNK_SIZE
215 n_missing = 0
212 } 216 }
213 217
214 # adjust the chunk size to get exact count of sequences: 218 # adjust the chunk size to get exact count of sequences:
215 CHUNK_SIZE = ceiling(n1/number_of_chunks) 219 CHUNK_SIZE = ceiling(n1/number_of_chunks)
216 save.image("tmp.RData") 220 save.image("tmp.RData")
217 221
218 print("--------------------------------") 222
219 print (sample_size_in_chunk)
220 print (opt$sample_size)
221 print (CHUNK_SIZE)
222 print("--------------------------------")
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE) 223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
224 total=0 224 total=0
225 nucleotideFrequenciesForward = matrix(0) 225 nucleotideFrequenciesForward = matrix(0)
226 while(TRUE){ 226 while(TRUE){
227 print (Sys.time()) 227 print (Sys.time())
289 289
290 290
291 # filter complete pairs again: 291 # filter complete pairs again:
292 292
293 # create new id - last character must differentiate pair - for interlacig 293 # create new id - last character must differentiate pair - for interlacig
294 if (length(fqF1)>sample_size_in_chunk){ 294 if (length(fqF1)>(sample_size_in_chunk + n_missing)){
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk)) 295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk + n_missing))
296 n_missing = 0
296 writeFun(fqF1[smp],file=f1out,mode='a') 297 writeFun(fqF1[smp],file=f1out,mode='a')
297 nfrq1 = alphabetByCycle(sread(fqF1[smp])) 298 nfrq1 = alphabetByCycle(sread(fqF1[smp]))
298 299
299 }else{ 300 }else{
300 writeFun(fqF1,file=f1out,mode='a') 301 writeFun(fqF1,file=f1out,mode='a')