diff paired_fastq_filtering.R @ 0:a4cd8608ef6b draft

Uploaded
author petr-novak
date Mon, 01 Apr 2019 07:56:36 -0400
parents
children 378565f5a875
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/paired_fastq_filtering.R	Mon Apr 01 07:56:36 2019 -0400
@@ -0,0 +1,404 @@
+#!/usr/bin/env Rscript
+# TODO - add triming from start and end, removing short reads
+library(optparse, quiet = TRUE)
+matricesSum = function(m1,m2){
+    mfin = matrix(0, nrow = nrow(m2), ncol = max(ncol(m1), ncol(m2)))
+    rownames(mfin) = rownames(m2)
+    if (sum(m1)>0){
+        mfin[,1:ncol(m1)] = mfin[,1:ncol(m1)] + m1
+    }
+    if (sum(m2)>0){
+        mfin[,1:ncol(m2)] = mfin[,1:ncol(m2)] + m2
+    }
+
+    return(mfin)
+}
+megablast = function(fx,params="-F \"m D\" -D 4 -p 90 ",database){
+                                        # assume blastn and makeblastdb in $PATH
+	tmp_in = tempfile()
+	tmp_out = tempfile()
+	writeFasta(fx,file=tmp_in)
+  ## check if db is present:
+  dbfiles = paste0(database, ".nhr")
+  if (!file.exists(dbfiles)){
+    cat("running makeblastdb\n")
+    system(paste0("makeblastdb -dbtype nucl -in ", database))
+  }
+  params = '-num_alignments 1 -num_threads 4  -outfmt "6 qseqid qcovs pident" '
+	cmd=paste("blastn", " -db", database, "-query", tmp_in,
+            "-out", tmp_out, " ", params)
+  status=system(cmd,intern=TRUE)
+	if (file.info(tmp_out)$size != 0){
+    results=read.table(tmp_out,sep="\t",as.is=TRUE,comment.char="")
+    colnames(results) = c("qseqid", "qcovs", "pident")
+	}else{
+		results=NULL
+	}
+  unlink(tmp_in)
+	unlink(tmp_out)
+	return(results)
+}
+
+
+plotFrequencies = function(x){
+    par(xaxs = 'i', yaxs = 'i')
+    plot(NULL, xlim = c(1,ncol(x)), ylim = c(0,1),type ='n', xlab="position", ylab = "proportion")
+    col = c(A="red", "T"="orange", C="blue", G="green")
+    for (n in c("A","C", "T","G")){
+        points(x[n,]/colSums(x), type = 'l', col = col[n])
+    }
+    grid(ny = 50, nx = ncol(x) - 1  )
+    legend("topright", col = col, lwd = 2, legend = names(col))
+}
+
+option_list = list(make_option(c("-a", "--fastqA"), action = "store", type = "character", 
+    help = "fastq file A", default = NA), make_option(c("-b", "--fastqB"), action = "store", 
+    type = "character", help = "fastq file B", default = NA), make_option(c("-x", 
+    "--fastqX"), action = "store", type = "character", help = "output fastq file X", 
+    default = NA), make_option(c("-y", "--fastqY"), action = "store", type = "character", 
+    help = "output fastq file Y", default = NA), make_option(c("-c", "--cut_off"), 
+    action = "store", type = "numeric", help = "Quality cut-off value [default %default]", 
+    default = 10), make_option(c("-r", "--rnd_seed"), action = "store", type = "numeric", 
+                               help = "seed for random number generator [default %default]", default = 123),
+    make_option(c("-G", "--png_file_output"), action = "store", type = "character", default=NA),
+    make_option(c("-p", "--percent_above"), action = "store", type = "numeric", help = "Percent of bases in sequence that must have quality equal to / higher than cut-off value [default %default]", 
+        default = 95), make_option(c("-e", "--trim_end"), action = "store", type = "numeric", 
+        help = "trimming - end position [no trimming by default]", default = NA), 
+    make_option(c("-s", "--trim_start"), action = "store", type = "numeric", help = "triming position - start  [no trimming by default]", 
+        default = NA), make_option(c("-n", "--sample_size"), action = "store", type = "numeric", 
+        help = "requested sample size (number of pairs)[no sampling by default]", 
+        default = NULL), make_option(c("-N", "--Max_N"), action = "store", type = "integer", 
+        help = "maximum number of Ns in sequences [default %default]", default = 0), 
+    make_option(c("-R", "--rename"), action = "store_true", type = "logical", help = "Rename sequences", 
+        default = FALSE), make_option(c("-f", "--format"), action = "store", type = "character", 
+        help = "format of output - fastq or fasta [default %default] ", default = "fasta"), 
+    make_option(c("-C", "--cutadapt_options"), action = "store", type = "character", 
+        help = "file specifying cutadapt options", default = NULL), make_option(c("-j", 
+        "--chunk_size"), action = "store", type = "numeric", help = "Number of sequences processed in single step. This option affect speed of processing and memory usage [default %default]", 
+        default = 1e+06),
+    make_option(c('-F', '--filter_seq'),action='store',type='character',help='file specifying sequences for filtering (e.g. plastid DNA)',default=NULL)
+    )
+
+description = paste(strwrap("Script for filterinq fastq files generated from paired end sequencing
+\t\t\t\t\t\t"), 
+    collapse = "\n")
+# arguments whic hcould be modifed
+cutadapt_arguments = paste(" --anywhere='AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ", 
+    " --anywhere='AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT'  ", 
+    " --anywhere='GATCGGAAGAGCACACGTCTGAACTCCAGTCAC' --anywhere='ATCTCGTATGCCGTCTTCTGCTTG' ", 
+    " --anywhere='CAAGCAGAAGACGGCATACGAGAT' --anywhere='GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC' ", 
+    "--error-rate=0.05 --times=1 --overlap=15 --discard ", " ", sep = "")
+
+
+epilogue = paste(strwrap(c("Description:\n
+This tool is designed to make memory efficient preprocessing of two fastq files. Input files can be in GNU zipped archive (.gz extension).
+Reads are filtered based on the quality, presence of N bases and adapters.
+Two input fastq files are procesed in parallel. Cutaddapt program must be installed to use adapter filtering.
+Only complete pair are kept. As the input files are process in chunks,
+it is required that pair reads are complete and in the same order in both input files. All reads which pass the quality
+filter fill be writen into ouytput files. If --sample size is specified, only sample of sequences will be returned.
+By default cutadapt us run with this options:
+\n ", 
+    cutadapt_arguments, "\n\nIf you want to use different adapter sequences specify them in separate file and use -C --cutadapt options.
+Consult cutadapt manual for usage. ")), 
+    collapse = "\n")
+
+epilogue = paste(epilogue, "
+fastqA                    fastqB
+   |                         |
+   V                         V
+----------------------------------
+|      Trimming (optional)       |
+----------------------------------
+   |                         |
+   |                         |
+   V                         V
+----------------------------------
+|        Filter by quality       |
+----------------------------------
+   |                         |
+   |                         |
+   V                         V
+---------------------------------
+|        Discard single reads   |--->
+|        Keep complete pairs    |
+---------------------------------
+   |                         |
+   V                         V
+----------------------------------
+|        cutadapt filtering    |
+----------------------------------
+   |                         |
+   V                         V
+---------------------------------
+|        Discard single reads   |--->
+|        Keep complete pairs    |
+---------------------------------
+   |                         |
+   |                         |
+   V                         V
+----------------------------------
+|      sample (optional)         |
+----------------------------------
+   |                         |
+   |                         |
+   V                         V
+ FastqX                    FastqY
+(or fastaX)             (or fastaY)
+
+# example:
+
+", 
+    sep = "\n")
+
+
+parser = OptionParser(option_list = option_list, epilogue = epilogue, description = description, 
+    )
+
+
+
+opt = parse_args(parser, args = commandArgs(TRUE))
+if (!(opt$format %in% c("fasta", "fastq"))) {
+    stop("wrong output format")
+}
+if (any(is.na(c(opt$fastqA, opt$fastqB, opt$fastqX, opt$fastqY)))) {
+    cat("\nInput ond output files must be specified!\n\n")
+    print_help(parser)
+    q()
+}
+
+
+if (!is.null(opt$cutadapt_options)) {
+    if (file.exists(opt$cutadapt_options)) {
+        cutadapt_arguments = scan(opt$cutadapt_options, what = "character", comment.char = "#")
+        
+    } else {
+        cutadapt_arguments = opt$cutadapt_options
+    }
+    ## remove 'EOF'
+    cutadapt_arguments = paste(" ", cutadapt_arguments, " ", collapse = " ", sep = " ")
+}
+
+
+
+cutadapt_cmd = function(input, output, cutadapt_arguments) {
+    cmds = paste("cutadapt --format=", "fastq", cutadapt_arguments, " ", c(input), 
+        " -o ", c(output), sep = "")
+}
+
+set.seed(opt$rnd_seed)
+PROP = opt$percent_above/100
+MINQ = opt$cut_off
+TRIM_END = opt$trim_end
+TRIM_START = opt$trim_start
+
+CHUNK_SIZE = opt$chunk_size
+
+MIN_LENGTH = 1 + TRIM_END - ifelse(is.na(TRIM_START), 1, TRIM_START)
+if (is.na(MIN_LENGTH)) {
+    MIN_LENGTH = 1
+}
+
+suppressPackageStartupMessages(library(ShortRead))
+Qfilter = srFilter(function(x) {
+    rowSums(as(quality(x), "matrix") > MINQ, na.rm = TRUE)/nchar(sread(x)) >= PROP
+})
+
+Min_Length_filter = srFilter(function(x) {
+    nchar(sread(x)) >= MIN_LENGTH
+})
+
+
+
+
+fin_filter = compose(Qfilter, nFilter(opt$Max_N), Min_Length_filter)
+
+if (opt$format == "fasta") {
+    writeFun = writeFasta
+} else {
+    writeFun = writeFastq
+}
+
+f1out = opt$fastqX
+f2out = opt$fastqY
+# are output files empty?:
+
+
+# find wich character specify pair: - it could be
+lA = readLines(opt$fastqA, n = 1)
+lB = readLines(opt$fastqB, n = 1)
+pair_tag_position = which(!strsplit(lA, "")[[1]] == strsplit(lB, "")[[1]])
+
+if ((length(pair_tag_position)) != 1 & !opt$rename) {
+    warning("unable to determine pair tag")
+}
+
+# pair_regex=paste(substring(lA,pair_tag_position-1,pair_tag_position-1),'.+',sep='')
+# assume 1/ same order 2/ all id must be same
+
+# check size of sequences - must be same...
+n1 = countLines(opt$fastqA)/4
+
+
+number_of_chunks = round(n1/CHUNK_SIZE)
+if (number_of_chunks == 0) {
+    CHUNK_SIZE = n1
+    number_of_chunks = 1
+}
+if (!is.null(opt$sample_size)) {
+    sample_size_in_chunk = opt$sample_size/number_of_chunks
+} else {
+    sample_size_in_chunk = CHUNK_SIZE
+}
+
+# adjust the chunk size to get exact count of sequences:
+CHUNK_SIZE = ceiling(n1/number_of_chunks)
+F_id = ifelse(opt$rename, "/1", "1")
+R_id = ifelse(opt$rename, "/2", "2")
+
+f1 <- FastqStreamer(opt$fastqA, CHUNK_SIZE)
+f2 <- FastqStreamer(opt$fastqB, CHUNK_SIZE)
+total = 0
+chunk = 0
+nucleotideFrequenciesForward = nucleotideFrequenciesReverse = matrix(0)
+while (TRUE) {
+    chunk = chunk + 1
+    fq1 <- yield(f1)
+    fq2 <- yield(f2)
+    if (length(fq1) == 0) {
+        break
+    }
+    ## rename
+    chunk_id = sprintf(paste0("%0", round(log10(number_of_chunks)) + 1, "d"), chunk)
+    cat("chunk id ", chunk_id, "\n")
+    fmt = paste0("%0", round(log10(length(fq1))) + 1, "d")
+    
+    ## either do not rename
+    if (opt$rename) {
+        ## or use index
+        fq1@id = fq2@id = BStringSet(paste0(chunk_id, sprintf(fmt, 1:length(fq1))))
+    } else {
+        ## or use largest common substring
+        ind = mapply(function(x, y) which(x != y)[1], strsplit(as.character(id(fq1)), 
+            split = ""), strsplit(as.character(id(fq2)), split = ""))
+        fq1@id = fq2@id = subseq(id(fq1), 1, ind - 1)
+        
+    }
+    
+    if (!is.na(TRIM_END)) {
+        fq1 = narrow(fq1, end = ifelse((TRIM_END) > width(fq1), width(fq1), TRIM_END))
+        fq2 = narrow(fq2, end = ifelse((TRIM_END) > width(fq2), width(fq2), TRIM_END))
+    }
+    if (!is.na(TRIM_START)) {
+        fq1 = narrow(fq1, start = ifelse((TRIM_START) < width(fq1), TRIM_START, width(fq1)))
+        fq2 = narrow(fq2, start = ifelse((TRIM_START) < width(fq2), TRIM_START, width(fq2)))
+        
+    }
+    
+    
+    fqF1 = fq1[fin_filter(fq1)]
+    fqF2 = fq2[fin_filter(fq2)]
+    
+    inc1 = id(fqF1) %in% id(fqF2)
+    inc2 = id(fqF2) %in% id(fqF1)
+    
+    cat("running cutadapt on chunk\n")
+    # cut addapt here: # filter parts:
+    tmp_in1 = tempfile(fileext = ".fastq")
+    tmp_out1 = tempfile(fileext = ".fastq")
+    tmp_in2 = tempfile(fileext = ".fastq")
+    tmp_out2 = tempfile(fileext = ".fastq")
+    
+    
+    if (is.null(formals(writeFastq)$compress)) {
+        writeFastq(fqF1[inc1], file = tmp_in1)
+        writeFastq(fqF2[inc2], file = tmp_in2)
+    } else {
+        writeFastq(fqF1[inc1], file = tmp_in1, compress = FALSE)
+        writeFastq(fqF2[inc2], file = tmp_in2, compress = FALSE)
+    }
+    
+    
+    cmd1 = cutadapt_cmd(tmp_in1, tmp_out1, cutadapt_arguments)
+    cmd2 = cutadapt_cmd(tmp_in2, tmp_out2, cutadapt_arguments)
+    # this should run cutadapt in parallel
+    
+    status = system(paste(cmd1, "& \n", cmd2, "&\nwait"), intern = TRUE)
+    # collect results
+    ftmp1 = FastqFile(tmp_out1)
+    
+    fqF1 = readFastq(ftmp1)
+    close(ftmp1)
+    
+    ftmp2 = FastqFile(tmp_out2)
+    fqF2 = readFastq(ftmp2)
+    close(ftmp2)
+    ## clean up
+    unlink(tmp_out1)
+    unlink(tmp_in1)
+    unlink(tmp_out2)
+    unlink(tmp_in2)
+    
+
+
+                                        # remove sequences similar to filter database (e.g. plastid DNA)
+    if (!is.null(opt$filter_seq)){
+      blast_results1 =  megablast(fqF1, database=opt$filter_seq)
+      blast_results2 =  megablast(fqF2, database=opt$filter_seq)
+      if (!is.null(blast_results1) &  !is.null(blast_results2)){
+        exclude1=with(blast_results1, unique(qseqid[pident >= 90 & qcovs >=90]))
+        exclude2=with(blast_results2, unique(qseqid[pident >= 90 & qcovs >=90]))
+        ## note - blast will truncate seq ids - anything beyond space is omitted
+        seq_to_exclude1 = gsub(" .*","",id(fqF1)) %in% exclude1
+        seq_to_exclude2 = gsub(" .*","",id(fqF2)) %in% exclude2
+        excluded_contamination1=signif(sum(seq_to_exclude1)/length(seq_to_exclude1)*100,3)
+        excluded_contamination2=signif(sum(seq_to_exclude2)/length(seq_to_exclude2)*100,3)
+        cat(excluded_contamination1,"% filtered out after blast - forward reads\n" )
+        cat(excluded_contamination2,"% filtered out after blast - reverse reads\n" )
+        fqF1 = fqF1[!seq_to_exclude1]
+        fqF2 = fqF2[!seq_to_exclude2]
+      }
+    }
+
+
+
+    
+    # filter complete pairs again: id1=gsub(pair_regex,'',id(fqF1))
+    # id2=gsub(pair_regex,'',id(fqF2))
+    inc1 = id(fqF1) %in% id(fqF2)
+    inc2 = id(fqF2) %in% id(fqF1)
+    total = sum(inc1) + total
+    ## create new id - last character must differentiate pair - for interlacig
+    
+    fqF1@id = BStringSet(paste0(id(fqF1), F_id))
+    fqF2@id = BStringSet(paste0(id(fqF2), R_id))
+    
+    
+    if (sum(inc1) > sample_size_in_chunk) {
+        smp = sort(sample(sum(inc1), sample_size_in_chunk))
+        writeFun(fqF1[inc1][smp], file = f1out, mode = "a")
+        writeFun(fqF2[inc2][smp], file = f2out, mode = "a")
+        nfrq1 = alphabetByCycle(sread(fqF1[inc1][smp]))
+        nfrq2 = alphabetByCycle(sread(fqF2[inc2][smp]))
+    } else {
+        writeFun(fqF1[inc1], file = f1out, mode = "a")
+        writeFun(fqF2[inc2], file = f2out, mode = "a")
+        nfrq1 = alphabetByCycle(sread(fqF1[inc1]))
+        nfrq2 = alphabetByCycle(sread(fqF2[inc2]))
+    }
+    nucleotideFrequenciesForward = matricesSum(nucleotideFrequenciesForward, nfrq1)
+    nucleotideFrequenciesReverse = matricesSum(nucleotideFrequenciesReverse, nfrq2)
+    
+}
+if( !is.na(opt$png_file_output)){
+    png(opt$png_file_output, width = 1000, height = 1000)
+    par(mfrow=c(2,1))
+    plotFrequencies(nucleotideFrequenciesForward)
+    mtext("forward reads")
+    plotFrequencies(nucleotideFrequenciesReverse)
+    mtext("reverse reads")
+    dev.off()
+}
+close(f1)
+close(f2)
+