diff single_fastq_filtering.R @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/single_fastq_filtering.R	Mon Apr 01 07:56:36 2019 -0400
@@ -0,0 +1,317 @@
+#!/usr/bin/env Rscript
+
+
+
+library(optparse,quiet=TRUE)
+matricesSum = function(m1,m2){
+    mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
+    rownames(mfin) = rownames(m2)
+    if (sum(m1)>0){
+        mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
+    }
+    if (sum(m2)>0){
+        mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
+    }
+
+    return(mfin)
+}
+
+plotFrequencies = function(x){
+    par(xaxs = 'i', yaxs = 'i')
+    plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
+    col = c(A="red", "T"="orange", C="blue", G="green")
+    for (n in c("A","C", "T","G")){
+        points(x[n,]/colSums(x), type = 'l', col = col[n])
+    }
+    grid(ny = 50, nx = ncol(x) - 1  )
+    legend("topright", col = col, lwd = 2, legend = names(col))
+}
+
+megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
+  # assume blastn and makeblastdb in $PATH
+	tmp_in = tempfile()
+	tmp_out = tempfile()
+	writeFasta(fx,file=tmp_in)
+  ## check if db is present:
+  dbfiles = paste0(database, ".nhr")
+  if (!file.exists(dbfiles)){
+    cat("running makeblastdb\n")
+    system(paste0("makeblastdb -dbtype nucl -in ", database))
+  }
+  params = '-num_alignments 1 -num_threads 4  -outfmt "6 qseqid qcovs pident" '
+	cmd=paste("blastn", " -db", database, "-query", tmp_in,
+            "-out", tmp_out, " ", params)
+  status=system(cmd,intern=TRUE)
+
+	if (file.info(tmp_out)$size != 0){
+    results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
+    colnames(results) = c("qseqid", "qcovs", "pident")
+	}else{
+		results=NULL
+	}
+  unlink(tmp_in)
+	unlink(tmp_out)
+	return(results)
+}
+
+
+
+option_list = list(
+  make_option(c('-a', '--fastqA'),action='store',type='character',help='fastq file A',default=NA),
+  make_option(c('-x', '--fastqX'),action='store',type='character',help='output fastq file X',default=NA),
+  make_option(c('-c', '--cut_off'),action='store',type='numeric',help='Quality cut-off value [default %default]',default=10),
+  make_option(c('-r', '--rnd_seed'),action='store',type='numeric',help='seed for random number generator [default %default]',default=123),
+  make_option(c('-p', '--percent_above'),action='store',type='numeric',
+              help='Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]',default=95),
+  make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
+  make_option(c('-e', '--trim_end'), action='store',type='numeric',help="trimming - end position [no trimming by default]", default=NA),
+  make_option(c('-s', '--trim_start'), action='store',type='numeric',help="triming position - start  [no trimming by default]", default=NA),
+
+  make_option(c('-n', '--sample_size'),action='store',type='numeric',help='requested sample size (number of pairs)[no sampling by default]',default=NULL),
+  make_option(c('-N', '--Max_N'),action='store',type='integer',help='maximum number of Ns in sequences [default %default]',default=0),
+  make_option(c('-f', '--format'),action='store',type='character',help='format of output - fastq or fasta [default %default] ',default="fasta"),
+  make_option(c('-C', '--cutadapt_options'),action='store',type='character',help='file specifying cutadapt options',default=NULL),
+  make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL),
+  make_option(c('-j', '--chunk_size'),action='store',type='numeric',help='Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]',default=1000000)
+
+
+
+
+
+)
+
+description=paste(strwrap('Script for filterinq fastq file f
+						'),collapse="\n")
+# arguments whic hcould be modifed
+cutadapt_arguments=paste(
+		" --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ",
+		" --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'  ",
+		" --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ",
+		" --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ",
+		"--error-rate=0.05 --times=1 --overlap=15 --discard ",
+		" ",sep='')
+
+
+epilogue=paste(strwrap(c('Description:\n
+This tool is designed to make memory efficient preprocessing of single fastq files. Input files can be in GNU zipped archive (.gz extension).
+Reads are filtered based on the quality, presence of N bases and adapters.
+Cutaddapt program must be installed to use adapter filtering.
+The input files are process in chunks,
+All reads which pass the quality filter fill be writen into output file.
+If --sample size is specified, only sample of sequences will be returned.
+By default cutadapt us run with this options:
+\n ',
+cutadapt_arguments,
+"\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
+Consult cutadapt manual for usage. ")
+),collapse="\n")
+
+epilogue=paste(epilogue,"
+
+
+# example:
+               TODO
+
+",sep='\n')
+
+
+parser=OptionParser(
+		option_list=option_list,
+		epilogue=epilogue,
+		description=description,
+)
+
+
+
+opt = parse_args(parser, args=commandArgs(TRUE))
+if  (!(opt$format %in% c("fasta","fastq"))){
+	stop("wrong output format")
+}
+
+if (any(is.na(c(opt$fastqA, opt$fastqX )))){
+	cat("\nInput ond output files must be specified!\n\n")
+	print_help(parser)
+	q()
+}
+
+
+if (!is.null(opt$cutadapt_options)){
+	cutadapt_argumenns=scan(opt$cutadapt_options,what="character", comment.char="#")
+	#remove "EOF"
+	cutadapt_arguments=paste(" ",cutadapt_arguments," ",collapse=' ',sep=" ")
+}
+
+
+cutadapt_cmd=function(input,output,cutadapt_arguments){
+	cmds=paste("cutadapt --format=",
+			"fastq",
+			cutadapt_arguments,
+			" ",
+			c(input),
+			" -o ",
+			c(output),
+			sep="")
+
+}
+
+set.seed(opt$rnd_seed)
+
+PROP=opt$percent_above/100
+MINQ=opt$cut_off
+TRIM_END=opt$trim_end
+TRIM_START=opt$trim_start
+
+CHUNK_SIZE=opt$chunk_size
+
+MIN_LENGTH=TRIM_END - ifelse(is.na(TRIM_START),1,TRIM_START)+1
+if (is.na(MIN_LENGTH)){
+	MIN_LENGTH=1
+}
+
+suppressPackageStartupMessages(library(ShortRead))
+Qfilter=srFilter(function(x){
+			rowSums(as(quality(x),'matrix')>MINQ,na.rm=TRUE)/nchar(sread(x))>=PROP
+		}
+)
+
+Min_Length_filter=srFilter(function(x){
+			nchar(sread(x))>=MIN_LENGTH
+		}
+)
+
+
+fin_filter=compose(Qfilter,nFilter(opt$Max_N),Min_Length_filter)
+
+if (opt$format=="fasta"){
+	writeFun=writeFasta
+}else{
+	writeFun=writeFastq
+}
+
+
+f1out=opt$fastqX
+
+#find wich character specify pair: - it could be
+
+# check size of sequences - must be same...
+n1=countLines(opt$fastqA)/4
+
+cat("number of sequences in ",opt$fastqA,":",n1,"\n")
+cat("--------\n")
+
+
+number_of_chunks=round(n1/CHUNK_SIZE)
+if (number_of_chunks==0){
+	CHUNK_SIZE=n1
+	number_of_chunks=1
+}
+if (!is.null(opt$sample_size)){
+	sample_size_in_chunk=opt$sample_size/number_of_chunks
+}else{
+	sample_size_in_chunk=CHUNK_SIZE
+}
+
+# adjust the chunk size to get exact count of sequences:
+CHUNK_SIZE = ceiling(n1/number_of_chunks)
+save.image("tmp.RData")
+
+print("--------------------------------")
+print (sample_size_in_chunk)
+print (opt$sample_size)
+print (CHUNK_SIZE)
+print("--------------------------------")
+f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
+total=0
+nucleotideFrequenciesForward = matrix(0)
+while(TRUE){
+	print (Sys.time())
+	fq1 <- yield(f1)
+	print (Sys.time())
+	if (length(fq1)==0){
+		break
+	}
+	
+	
+	
+	
+	if (!is.na(TRIM_END)){
+		fq1=narrow(fq1,end=ifelse((TRIM_END)>width(fq1),width(fq1),TRIM_END))
+	}
+	if (!is.na(TRIM_START)){
+		fq1=narrow(fq1,start=ifelse((TRIM_START)<width(fq1),TRIM_START,width(fq1)))
+	}
+	
+
+
+	fqF1=fq1[fin_filter(fq1)]
+
+	cat("running cutadapt on chunk\n")
+	# cut addapt here: # filter parts:
+	tmp_in1=tempfile(fileext=".fastq")
+	tmp_out1=tempfile(fileext=".fastq")
+        if (is.null(formals(writeFastq)$compress)){
+            writeFastq(fqF1,file=tmp_in1)
+        }else{
+            writeFastq(fqF1,file=tmp_in1, compress = FALSE)
+        }
+
+
+	cmd1=cutadapt_cmd(tmp_in1,tmp_out1, cutadapt_arguments)
+
+	status=system(cmd1,intern=TRUE)
+	print (Sys.time())
+	# collect results
+	ftmp1=FastqFile(tmp_out1)
+
+	fqF1=readFastq(ftmp1)
+	close(ftmp1)
+
+  # remove sequences similar to filter database (e.g. plastid DNA)
+  if (!is.null(opt$filter_seq)){
+		blast_results =  megablast(fqF1, database=opt$filter_seq)
+		if (!is.null(blast_results)){
+			exclude=with(blast_results, unique(qseqid[pident >= 90 & qcovs >=90]))
+      ## note - blast will truncate seq ids - anything beyond space is omitted
+			seq_to_exclude = gsub(" .*","",id(fqF1)) %in% exclude
+			excluded_contamination=signif(sum(seq_to_exclude)/length(seq_to_exclude)*100,3)
+      cat(excluded_contamination,"% filtered out after blast\n" )
+			fqF1 = fqF1[!seq_to_exclude]
+		}
+	}
+
+
+
+	# clean up
+	unlink(tmp_out1)
+	unlink(tmp_in1)
+
+
+  
+
+	# filter complete pairs again:
+
+	# create new id - last character must differentiate pair - for interlacig
+	if (length(fqF1)>sample_size_in_chunk){
+		smp=sort(sample(seq_along(fqF1),sample_size_in_chunk))
+		writeFun(fqF1[smp],file=f1out,mode='a')
+    nfrq1 = alphabetByCycle(sread(fqF1[smp]))
+
+	}else{
+      writeFun(fqF1,file=f1out,mode='a')
+      nfrq1 = alphabetByCycle(sread(fqF1))
+	}
+  nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
+
+}
+
+if( !is.na(opt$png_file_output)){
+    png(opt$png_file_output, width = 1000, height = 500)
+    plotFrequencies(nucleotideFrequenciesForward)
+    dev.off()
+}
+
+close(f1)
+
+
+
+