comparison minfi_TCGA_pipeline.R @ 0:84361ce36a11 draft

planemo upload commit fb90aafc93e5e63acfcdac4c27cfd865cdf06c5a-dirty
author nturaga
date Tue, 19 Apr 2016 11:10:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:84361ce36a11
1 # setup R error handling to go to stderr
2 options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)})
3
4 # we need that to not crash galaxy with an UTF8 error on German LC settings.
5 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
6
7 library("getopt")
8 options(stringAsfactors = FALSE, useFancyQuotes = FALSE)
9 args <- commandArgs(trailingOnly = TRUE)
10
11 # get options, using the spec as defined by the enclosed list.
12 # we read the options from the default: commandArgs(TRUE).
13 spec <- matrix(c(
14 'quiet', 'q', 2, "logical",
15 'help' , 'h', 0, "logical",
16 'tarfile','f',1,"character",
17 "cores","c",1,"integer",
18 "b_permutations","b",2,"integer",
19 "smooth","m",2,"logical",
20 "l_value","l",2,"integer")
21 ,byrow=TRUE, ncol=4)
22 opt <- getopt(spec)
23
24 ## If help was asked for print a friendly message
25 ## and exit with a non-zero error code
26 if (!is.null(opt$help)) {
27 cat(getopt(spec, usage=TRUE))
28 q(status=1)
29 }
30
31
32 ## Set verbose mode
33 verbose = if(is.null(opt$quiet)){TRUE}else{FALSE}
34 if(verbose){
35 cat("Verbose mode is ON\n\n")
36 }
37
38 ## Load required libraries
39 suppressPackageStartupMessages({
40 library("minfi")
41 library("FlowSorted.Blood.450k")
42 library("TxDb.Hsapiens.UCSC.hg19.knownGene")
43 library("doParallel")
44 library("tools")
45 })
46
47
48 config_file = "tcga_temp/config.txt"
49 conf = read.csv(config_file,stringsAsFactors=FALSE,header=F)
50 tarfile_name = gsub(".+/","",conf$V2)
51 dataset_path = conf$V1
52
53 cmd = paste("ln -s",dataset_path,tarfile_name,sep=" ")
54 cat("Command : ", cmd,"\n")
55 system(cmd)
56
57 tarfile = tarfile_name
58 cat ("tarfile name: ",tarfile," file ext: ",file_ext(tarfile))
59 ## UNtar files in R first
60 if (file_ext(tarfile) == "tar"){
61 cat("Entering IF statment")
62 tar_contents = untar(tarfile,list=TRUE)
63 cat("regex failing here")
64 f = as.character(tar_contents[grep(".data.txt",fixed=TRUE,x=tar_contents)])
65 if (!is.null(f)){
66 cat("Untar being attempted")
67 untar(tarfile, exdir = ".",files=f)
68 cat("Untar succcess")
69 }
70 }
71 ## Move file from sub directory to main directory
72 from = list.files(pattern=".data.txt",recursive=TRUE)
73 to = gsub(".+/","",from)
74 rename_success = file.rename(from=from, to=to)
75
76
77 # This should pass only if steps have been successful
78 stopifnot(rename_success)
79 if (rename_success){
80 input_file = to
81 }
82
83 ### Read the TCGA data
84 GRset = readTCGA(input_file, sep = "\t", keyName = "Composite Element REF", Betaname = "Beta_value", pData = NULL, array = "IlluminaHumanMethylation450k",showProgress=TRUE)
85
86 ### Get beta values
87 beta = getBeta(GRset)
88 pd = pData(GRset)
89 CN = getCN(GRset)
90 chr = seqnames(GRset)
91 pos = start(GRset)
92 chr = as.character(chr)
93
94
95 # Assign phenotype information
96 ## Based on TCGA sample naming, TCGA-2E-A9G8-01A-11D-A409-05, char 14,15 represent
97 ## phenotypic status of sample, 01 = cancer, 11=normal
98 pd$status = ifelse(test= (substr(rownames(pd),14,15) == "01"),yes="cancer",no="normal")
99
100 ### DMP finder
101 dmp = dmpFinder(dat=beta,pheno=pd$status,type="categorical")
102 write.csv(dmp,file="dmps.csv",quote=FALSE,row.names=TRUE)
103 if(verbose){
104 cat("DMP Finder successful \n")
105 }
106
107
108 ## Make design matrix for bumphunting
109 #Model Matrix
110 T1="normal";T2="cancer"
111 ## Introduce error if levels are different
112 stopifnot(T1!=T2)
113 keep=pd$status%in%c(T1,T2)
114 tt=factor(pd$status[keep],c(T1,T2))
115 design=model.matrix(~tt)
116
117
118 ## Start bumphunter in a parallel environment
119 ## Parallelize over cores on machine
120 library(doParallel)
121 registerDoParallel(cores = opt$cores)
122
123 # Bumphunter Run with object processed with default Quantile Normalization
124 # provided along with TCGA data
125 dmrs = bumphunter(beta[,keep],chr=chr,pos=pos,design=design,B=opt$b_permutations,smooth=opt$smooth,pickCutoff =TRUE)
126 bumps = dmrs$tab
127
128 if(verbose){
129 cat("Bumphunter result", "\n")
130 head(bumps)
131 }
132
133
134 ### Choose DMR's of a certain length threshold.
135 ### This helps reduce the size of DMRs early, and match
136 ### with genes closest to region
137 bumps = bumps[bumps$L > opt$l_value,]
138 genes <- annotateTranscripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
139 tab=matchGenes(bumps,genes)
140 annotated_dmrs=cbind(bumps,tab)
141
142 if(verbose){
143 cat("Match with annotation\n")
144 head(annotated_dmrs)
145 }
146
147 ## Save result, which contains DMR's and closest genes
148 write.csv(annotated_dmrs,file = "dmrs.csv",quote=FALSE,row.names=FALSE)
149
150 ## Garbage collect
151 gc()
152
153