Mercurial > repos > petr-novak > re_utils
comparison single_fastq_filtering.R @ 5:378565f5a875 draft
Uploaded
author | petr-novak |
---|---|
date | Fri, 22 Nov 2019 07:56:48 -0500 |
parents | a4cd8608ef6b |
children | f1738f8649b0 |
comparison
equal
deleted
inserted
replaced
4:d397f5a85464 | 5:378565f5a875 |
---|---|
199 cat("number of sequences in ",opt$fastqA,":",n1,"\n") | 199 cat("number of sequences in ",opt$fastqA,":",n1,"\n") |
200 cat("--------\n") | 200 cat("--------\n") |
201 | 201 |
202 | 202 |
203 number_of_chunks=round(n1/CHUNK_SIZE) | 203 number_of_chunks=round(n1/CHUNK_SIZE) |
204 CHUNK_SIZE = round(n1/number_of_chunks) | |
205 | |
204 if (number_of_chunks==0){ | 206 if (number_of_chunks==0){ |
205 CHUNK_SIZE=n1 | 207 CHUNK_SIZE=n1 |
206 number_of_chunks=1 | 208 number_of_chunks=1 |
207 } | 209 } |
208 if (!is.null(opt$sample_size)){ | 210 if (!is.null(opt$sample_size)){ |
209 sample_size_in_chunk=opt$sample_size/number_of_chunks | 211 sample_size_in_chunk=round(opt$sample_size/number_of_chunks) |
212 n_missing = opt$sample_size - sample_size_in_chunk * number_of_chunks | |
210 }else{ | 213 }else{ |
211 sample_size_in_chunk=CHUNK_SIZE | 214 sample_size_in_chunk=CHUNK_SIZE |
215 n_missing = 0 | |
212 } | 216 } |
213 | 217 |
214 # adjust the chunk size to get exact count of sequences: | 218 # adjust the chunk size to get exact count of sequences: |
215 CHUNK_SIZE = ceiling(n1/number_of_chunks) | 219 CHUNK_SIZE = ceiling(n1/number_of_chunks) |
216 save.image("tmp.RData") | 220 save.image("tmp.RData") |
217 | 221 |
218 print("--------------------------------") | 222 |
219 print (sample_size_in_chunk) | |
220 print (opt$sample_size) | |
221 print (CHUNK_SIZE) | |
222 print("--------------------------------") | |
223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE) | 223 f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE) |
224 total=0 | 224 total=0 |
225 nucleotideFrequenciesForward = matrix(0) | 225 nucleotideFrequenciesForward = matrix(0) |
226 while(TRUE){ | 226 while(TRUE){ |
227 print (Sys.time()) | 227 print (Sys.time()) |
289 | 289 |
290 | 290 |
291 # filter complete pairs again: | 291 # filter complete pairs again: |
292 | 292 |
293 # create new id - last character must differentiate pair - for interlacig | 293 # create new id - last character must differentiate pair - for interlacig |
294 if (length(fqF1)>sample_size_in_chunk){ | 294 if (length(fqF1)>(sample_size_in_chunk + n_missing)){ |
295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk)) | 295 smp=sort(sample(seq_along(fqF1),sample_size_in_chunk + n_missing)) |
296 n_missing = 0 | |
296 writeFun(fqF1[smp],file=f1out,mode='a') | 297 writeFun(fqF1[smp],file=f1out,mode='a') |
297 nfrq1 = alphabetByCycle(sread(fqF1[smp])) | 298 nfrq1 = alphabetByCycle(sread(fqF1[smp])) |
298 | 299 |
299 }else{ | 300 }else{ |
300 writeFun(fqF1,file=f1out,mode='a') | 301 writeFun(fqF1,file=f1out,mode='a') |