Mercurial > repos > nturaga > minfi_pipeline
diff minfi_TCGA_pipeline.R @ 0:84361ce36a11 draft
planemo upload commit fb90aafc93e5e63acfcdac4c27cfd865cdf06c5a-dirty
author | nturaga |
---|---|
date | Tue, 19 Apr 2016 11:10:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/minfi_TCGA_pipeline.R Tue Apr 19 11:10:25 2016 -0400 @@ -0,0 +1,153 @@ +# setup R error handling to go to stderr +options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)}) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("getopt") +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + 'quiet', 'q', 2, "logical", + 'help' , 'h', 0, "logical", + 'tarfile','f',1,"character", + "cores","c",1,"integer", + "b_permutations","b",2,"integer", + "smooth","m",2,"logical", + "l_value","l",2,"integer") + ,byrow=TRUE, ncol=4) +opt <- getopt(spec) + +## If help was asked for print a friendly message +## and exit with a non-zero error code +if (!is.null(opt$help)) { + cat(getopt(spec, usage=TRUE)) + q(status=1) +} + + +## Set verbose mode +verbose = if(is.null(opt$quiet)){TRUE}else{FALSE} +if(verbose){ + cat("Verbose mode is ON\n\n") +} + +## Load required libraries +suppressPackageStartupMessages({ + library("minfi") + library("FlowSorted.Blood.450k") + library("TxDb.Hsapiens.UCSC.hg19.knownGene") + library("doParallel") + library("tools") +}) + + +config_file = "tcga_temp/config.txt" +conf = read.csv(config_file,stringsAsFactors=FALSE,header=F) +tarfile_name = gsub(".+/","",conf$V2) +dataset_path = conf$V1 + +cmd = paste("ln -s",dataset_path,tarfile_name,sep=" ") +cat("Command : ", cmd,"\n") +system(cmd) + +tarfile = tarfile_name +cat ("tarfile name: ",tarfile," file ext: ",file_ext(tarfile)) +## UNtar files in R first +if (file_ext(tarfile) == "tar"){ + cat("Entering IF statment") + tar_contents = untar(tarfile,list=TRUE) + cat("regex failing here") + f = as.character(tar_contents[grep(".data.txt",fixed=TRUE,x=tar_contents)]) + if (!is.null(f)){ + cat("Untar being attempted") + untar(tarfile, exdir = ".",files=f) + cat("Untar succcess") + } +} +## Move file from sub directory to main directory +from = list.files(pattern=".data.txt",recursive=TRUE) +to = gsub(".+/","",from) +rename_success = file.rename(from=from, to=to) + + +# This should pass only if steps have been successful +stopifnot(rename_success) +if (rename_success){ + input_file = to +} + +### Read the TCGA data +GRset = readTCGA(input_file, sep = "\t", keyName = "Composite Element REF", Betaname = "Beta_value", pData = NULL, array = "IlluminaHumanMethylation450k",showProgress=TRUE) + +### Get beta values +beta = getBeta(GRset) +pd = pData(GRset) +CN = getCN(GRset) +chr = seqnames(GRset) +pos = start(GRset) +chr = as.character(chr) + + +# Assign phenotype information +## Based on TCGA sample naming, TCGA-2E-A9G8-01A-11D-A409-05, char 14,15 represent +## phenotypic status of sample, 01 = cancer, 11=normal +pd$status = ifelse(test= (substr(rownames(pd),14,15) == "01"),yes="cancer",no="normal") + +### DMP finder +dmp = dmpFinder(dat=beta,pheno=pd$status,type="categorical") +write.csv(dmp,file="dmps.csv",quote=FALSE,row.names=TRUE) +if(verbose){ + cat("DMP Finder successful \n") +} + + +## Make design matrix for bumphunting +#Model Matrix +T1="normal";T2="cancer" +## Introduce error if levels are different +stopifnot(T1!=T2) +keep=pd$status%in%c(T1,T2) +tt=factor(pd$status[keep],c(T1,T2)) +design=model.matrix(~tt) + + +## Start bumphunter in a parallel environment +## Parallelize over cores on machine +library(doParallel) +registerDoParallel(cores = opt$cores) + +# Bumphunter Run with object processed with default Quantile Normalization +# provided along with TCGA data +dmrs = bumphunter(beta[,keep],chr=chr,pos=pos,design=design,B=opt$b_permutations,smooth=opt$smooth,pickCutoff =TRUE) +bumps = dmrs$tab + +if(verbose){ + cat("Bumphunter result", "\n") + head(bumps) +} + + +### Choose DMR's of a certain length threshold. +### This helps reduce the size of DMRs early, and match +### with genes closest to region +bumps = bumps[bumps$L > opt$l_value,] +genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene) +tab=matchGenes(bumps,genes) +annotated_dmrs=cbind(bumps,tab) + +if(verbose){ + cat("Match with annotation\n") + head(annotated_dmrs) +} + +## Save result, which contains DMR's and closest genes +write.csv(annotated_dmrs,file = "dmrs.csv",quote=FALSE,row.names=FALSE) + +## Garbage collect +gc() + +