Mercurial > repos > kyost > atac_primer_tool
view find_qPCR_regions.R @ 4:72571a30f17b draft
Add test for unique peak names.
author | kyost |
---|---|
date | Wed, 02 May 2018 13:20:21 -0400 |
parents | fbfe7b785ea7 |
children | 3cd53127a838 |
line wrap: on
line source
## Command to run tool: # Rscript --vanilla find_qPCR_regions.R o.coverage.bed f9.coverage.bed lib_sizes.txt cor_cutoff cov_cutoff output_file # Set up R error handling to go to stderr options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)}) # Avoid crashing Galaxy with an UTF8 error on German LC settings loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") args <- commandArgs(TRUE) o.coverage <- args[1] f9.coverage <- args[2] lib_sizes <- args[3] corr_cutoff <- as.numeric(args[4]) cov_cutoff <- as.numeric(args[5]) output_file <- args[6] o.coverage.data <- read.delim(o.coverage, header=FALSE) f9.coverage.data <- read.delim(f9.coverage, header=FALSE) lib.sizes.data <- read.delim(lib_sizes, header=FALSE) m <- ncol(o.coverage.data) #normalize library sizes to 50 mil reads lib.sizes.data <- lib.sizes.data/50000000 #normalize peak reads and spanning reads by library size o.coverage.data[,5:m] <- t(apply(o.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data))) f9.coverage.data[,5:m] <- t(apply(f9.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data))) #returns list of dataframes containing windows for each peak split_coverage <- function(a) { returnlist <- list() temp <- a[1,] for (i in 2:nrow(a)) { if (temp[1,4] == strsplit(as.character(a[i,4]),"_window")[[1]][1]){ temp <- rbind(temp, a[i,]) }else{ returnlist[[length(returnlist)+1]] <- temp temp <- a[i,] } } returnlist[[length(returnlist)+1]] <- temp return(returnlist) } make_cor_plot <- function(a,b, corr_cut, cov_cut, output_file) { #a=o.coverage b=f9.coverage asplit <- split_coverage(a) bsplit <- split_coverage(b) combined_regions <- data.frame() for (i in 1:length(asplit)){ n <- nrow(asplit[[i]])-1 m <- ncol(asplit[[i]]) a_split <- asplit[[i]] b_split <- bsplit[[i]] if (sd(t(a_split[1,5:m])) == 0 || sd(t(b_split[-1,5:m])) == 0 ){ next } #calculate correlation between total peak reads and spanning reads in each window corb <- data.frame(cor(t(a_split[1,5:m]) , t(b_split[-1,5:m]) , method = "pearson")) data <- data.frame(t(corb), row.names = NULL, stringsAsFactors = FALSE) data <- cbind(rep(c(1:n)) , a_split[-1,1:4] , data) colnames(data)<-c("position","chr", "start", "stop", "name", "correlation") rownames(data)<-NULL a_split$average <- rowMeans(a_split[,5:m]) b_split$average <- rowMeans(b_split[,5:m]) data2 <- data.frame(cbind(c(1:n), a_split[-1,m+1]), stringsAsFactors=FALSE) data3 <- data.frame(cbind(c(1:n), b_split[-1,m+1]), stringsAsFactors=FALSE) colnames(data2)<-c("position","averageReadDepth") colnames(data3)<-c("position","averageReadDepth") #keep windows for which correlation with total peak reads is above threshold and #number of spanning reads is above cutoff regions <- data[which(data$correlation > corr_cut & data3$averageReadDepth > cov_cut),] regions <- regions[,2:4] #collapse windows to non-overlappin regions if (nrow(regions)>0){ regions$V4 <- paste(as.character(a_split[1,4]),"_region1",sep="") collapsed_regions <- regions[1,] nregion <- 1 if (nrow(regions)>=2){ for (j in 2:nrow(regions)) { if (as.numeric(regions[j,2]) <= (as.numeric(regions[j-1,3]))) { collapsed_regions[nrow(collapsed_regions),3] <- regions[j,3] }else{ collapsed_regions <- rbind(collapsed_regions, regions[j,]) nregion <- nregion + 1 collapsed_regions[nrow(collapsed_regions),4] <- paste(as.character(a_split[1,4]) ,"_region" , as.character(nregion) ,sep="") } } } rownames(collapsed_regions) <- NULL combined_regions <- rbind(combined_regions, collapsed_regions) } } write.table(combined_regions , output_file , sep = "\t" , col.names = FALSE , row.names = FALSE , quote = FALSE) print("Successfully found optimal ATAC-qPCR regions.") } make_cor_plot(o.coverage.data, f9.coverage.data, corr_cutoff, cov_cutoff, output_file)