Mercurial > repos > nturaga > minfi_analyze_tcga
diff minfi_pipeline.R @ 0:8b26eeb2da29 draft
planemo upload commit fb90aafc93e5e63acfcdac4c27cfd865cdf06c5a-dirty
author | nturaga |
---|---|
date | Tue, 19 Apr 2016 11:11:16 -0400 |
parents | |
children | 3cd8ea4f7079 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/minfi_pipeline.R Tue Apr 19 11:11:16 2016 -0400 @@ -0,0 +1,249 @@ +# setup R error handling to go to stderr +options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)}) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("getopt") +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + 'quiet', 'q', 2, "logical", + 'help' , 'h', 0, "logical", + "preprocess","p",1,"character", + "cores","c",1,"integer", + "numPositions","n",2,"integer", + "shrinkVar","s",2,"logical", + "b_permutations","b",2,"integer", + "smooth","m",2,"logical", + "cutoff","t",2,"double", + "l_value","l",2,"integer") + ,byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# If help was asked for print a friendly message +# and exit with a non-zero error code +if (!is.null(opt$help)) { + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +## Set verbose mode +verbose = if(is.null(opt$quiet)){TRUE}else{FALSE} +if(verbose){ + cat("Verbose mode is ON\n\n") +} + +# Enforce the following required arguments +if (is.null(opt$preprocess)) { + cat("'--preprocess' is required\n") + q(status=1) +} +cat("verbose = ", opt$quiet,"\n") +cat("preprocess = ",opt$preprocess,"\n") +cat("cores = ", opt$cores, "\n") +cat("b_permutations = ",opt$b_permutations,"\n") +cat("smooth = ",opt$smooth,"\n") +cat("cutoff = ",opt$cutoff,"\n") +cat("l_value = ",opt$l_value,"\n") +cat("numPositions = ",opt$numPositions,"\n") +cat("shrinkVar = ",opt$shrinkVar,"\n") + + +# Load required libraries +suppressPackageStartupMessages({ + library("minfi") + library("FlowSorted.Blood.450k") + library("TxDb.Hsapiens.UCSC.hg19.knownGene") + library("doParallel") +}) + + +## Parse cheetah code and make dataframe for creating tmp dir +minfi_config_file = paste0("minfi_temp","/minfi_config.txt") +minfi_config = read.table(minfi_config_file) +colnames(minfi_config) = c("status","green","red","name") + + +## Make the tmpdir for symlinking data +base_dir = paste0("minfi_temp","/base") +system(paste0("mkdir ",base_dir)) + + +## Make symlinks of files +for (i in 1:nrow(minfi_config)){ + stopifnot(nrow(minfi_config) == nrow(minfi_config["name"])) + + ## Make green idat file symlinks + file_green = paste0(base_dir,"/",as.character(minfi_config[i,"name"]),"_Grn.idat") + cmd_green = paste("ln -s",as.character(minfi_config[i,"green"]),file_green,sep=" ") + cat("Reading file ",i,"GREEN Channel ", file_green) + system(cmd_green) + + ## Make red idat file symlinks + file_red = paste0(base_dir,"/",as.character(minfi_config[i,"name"]),"_Red.idat") + cmd_red = paste("ln -s",as.character(minfi_config[i,"red"]),file_red,sep=" ") + cat("Reading file ",i,"RED Channel ", file_red) + system(cmd_red) +} + +## Make dataframe with Basenames +Basename = paste0(base_dir,"/",unique(substr(list.files(base_dir),1,17))) +status = minfi_config[match(gsub(".+/","",Basename), minfi_config$name),"status"] +targets = data.frame(Basename=Basename,status=status) + +if ( verbose ) { + cat("Minfi targets file:\n\n ") + print(targets) +} + +## Read 450k files +RGset = read.450k.exp(targets=targets,verbose=T) + +if (verbose){ + cat("RGset has been read: \n\n") + print(RGset) +} + +pd = pData(RGset) + +## NOTE: QC report is for samples before normalization +## QC REPORT +files = gsub(".+/","",pd$filenames) +## Produce PDF file +if (!is.null(RGset)) { + # Make PDF of QC report + minfi::qcReport(rgSet=RGset,sampNames=files,sampGroups=pd$status,pdf="qc_report.pdf") +} + + +## MDS Plot +## Set phenotype data +files = gsub(".+/","",pd$filenames) +#numPositions=as.integer("${numPositions}") + +## Produce PDF file +if (!is.null(RGset)) { + ## Make PDF of MDS plot + pdf("mds_plot.pdf") + minfi::mdsPlot(dat=RGset,sampNames=files,sampGroups=pd$status,main="Beta MDS",numPositions = opt$numPositions,pch=19) + dev.off() +} + +if(verbose){ + cat("Made plot of QC report and MDS plot\n\n") +} + + +## Preprocess data with the normalization method chosen +if(opt$preprocess == "quantile"){ + normalized_RGset = preprocessQuantile(RGset) + if (verbose){cat("Preprocessed using Quantile normalization")}; +} else if (opt$preprocess == "funnorm"){ + normalized_RGset = preprocessFunnorm(RGset) + if(verbose){print("Preprocessed using Functional normalization")} +} else if (opt$preprocess == "noob"){ + normalized_RGset = preprocessNoob(RGset) + if (verbose){cat("Preprocessed using Noob normalization")}; +} else if (opt$preprocess == "illumina"){ + normalized_RGset = preprocessIllumina(RGset,bg.correct = TRUE, normalize = c("controls", "no"),reference = 1) + if(verbose){print("Preprocessed using Illumina normalization")} +} else if (opt$preprocess == "swan"){ + normalized_RGset = preprocessSWAN(RGset) + if(verbose){print("Preprocessed using SWAN normalization")} +}else { + normalized_RGset = RGset + if(verbose){print("Preprocessed using No normalization")} +} + + + +## Get beta values from Proprocessed data +beta = getBeta(normalized_RGset) +## Set phenotype data +pd = pData(normalized_RGset) + + +## DMP finder +dmp = dmpFinder(dat=beta,pheno=pd$status,type="categorical",shrinkVar=opt$shrinkVar) +write.csv(dmp,file="dmps.csv",quote=FALSE,row.names=TRUE) +if(verbose){ + cat("\n","DMP Finder successful \n") +} + + +# Model Matrix to pass into the bumphunter function +T1= levels(pd$status)[2] +T2= levels(pd$status)[1] + +# Introduce error if levels are different +stopifnot(T1!=T2) + +keep=pd$status%in%c(T1,T2) +tt=factor(pd$status[keep],c(T1,T2)) +design=model.matrix(~tt) + +if(verbose){ + cat("Model matrix is: \n") + design +} + +# Start bumphunter in a parallel environment +# Parallelize over cores on machine +registerDoParallel(cores = opt$cores) + +## Bumphunter Run with normalized_RGset processed with Quantile Normalization +if (is(normalized_RGset,"GenomicRatioSet")) { + res=bumphunter(normalized_RGset[,keep],design,B=opt$b_permutations,smooth=opt$smooth,cutoff= opt$cutoff,type="Beta") + bumps= res$tab +} else if(is(normalized_RGset,"MethylSet")) { + # convert MethylSet (norm.swan) into GenomicRatioSet through Ratioset + normalized_RGset = ratioConvert(normalized_RGset, what = "both", keepCN = TRUE) + normalized_RGset = mapToGenome(normalized_RGset) + res = bumphunter(normalized_RGset[,keep],design=design,B=opt$b_permutations,smooth=opt$smooth,cutoff=opt$cutoff) + bumps = res$tab +} else { + # This else case is never supposed to run, + # it will run only if the normalized_RGset object + # did not have the expected class type + cat("Bumphunter did not run properly","\n") + stopifnot(!is.null(bumps)) +} + +if(verbose){ + cat("Bumphunter result", "\n") + head(bumps) +} + + +## Choose DMR's of a certain length threshold. +## This helps reduce the size of DMRs early, and match +## with genes closest to region +bumps = bumps[bumps$L > opt$l_value,] +genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene) +tab=matchGenes(bumps,genes) +annotated_dmrs=cbind(bumps,tab) + +if(verbose){ + cat("Match with annotation\n") + head(annotated_dmrs) +} + +# Save result, which contains DMR's and closest genes +write.csv(annotated_dmrs,file = "dmrs.csv",quote=FALSE,row.names=FALSE) + +# Garbage collect +gc() + + +## TODO: FIX BLOCK FINDER +# Block finder +#library(sva) +#pheno <- pData(GRset) +#mod <- model.matrix(~as.factor(status), data=pheno) +#mod0 <- model.matrix(~1, data=pheno) +#sva.results <- sva(mval, mod, mod0)