comparison run_spp.R @ 2:86146a313b66 draft

Uploaded
author modencode-dcc
date Thu, 17 Jan 2013 14:36:52 -0500
parents
children 495a6d033ca1
comparison
equal deleted inserted replaced
1:6ed68e907ff6 2:86146a313b66
1 # run_spp.R
2 # =============
3 # Author: Anshul Kundaje, Computer Science Dept., Stanford University
4 # Email: akundaje@stanford.edu
5 # Last updated: Oct 8, 2010
6 # =============
7 # MANDATORY ARGUMENTS
8 # -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
9 # MANDATORY ARGUMENT FOR PEAK CALLING
10 # -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
11 # OPTIONAL ARGUMENTS
12 # -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600
13 # -speak=<strPeak>, user-defined cross-correlation peak strandshift
14 # -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10)
15 # -p=<nodes> , number of parallel processing nodes, default=NULL
16 # -fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling
17 # -npeak=<numPeaks>, threshold on number of peaks to call
18 # -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
19 # -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
20 # OUTPUT PARAMETERS
21 # -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
22 # -savn=<narrowpeakfilename> OR -savn NarrowPeak file name
23 # -savr=<regionpeakfilename> OR -savr RegionPeak file name
24 # -savd=<rdatafile> OR -savd , save Rdata file
25 # -savp=<plotdatafile> OR -savp , save cross-correlation plot
26 # -out=<resultfile>, append peakshift result to a file
27 # format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag
28 # -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
29 # -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
30
# Options given after the script name on the command line (Rscript run_spp.R <options>)
args <- commandArgs(trailingOnly = TRUE)
# How many options were supplied; compared against the allowed range in the main section
nargs <- length(args)
33
34 # ###########################################################################
35 # AUXILIARY FUNCTIONS
36 # ###########################################################################
37
print.usage <- function() {
  # ===================================
  # Print the command-line usage text of run_spp.R to stderr.
  # Takes no arguments and is called only for its side effect.
  # ===================================
  usage.lines <- c(
    'Usage: Rscript run_spp.R <options>\n',
    'MANDATORY ARGUMENTS\n',
    '-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped)(FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',
    'MANDATORY ARGUMENTS FOR PEAK CALLING\n',
    '-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',
    'OPTIONAL ARGUMENTS\n',
    '-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600\n',
    '-speak=<strPeak>, user-defined cross-correlation peak strandshift\n',
    '-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)\n',
    '-p=<nodes> , number of parallel processing nodes, default=0\n',
    '-fdr=<falseDiscoveryRate> , false discovery rate threshold for peak calling\n',
    '-npeak=<numPeaks>, threshold on number of peaks to call\n',
    '-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)\n',
    '-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name\n',
    'OUTPUT ARGUMENTS\n',
    '-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)\n',
    '-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)\n',
    '-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)\n',
    '-savd=<rdatafile> OR -savd, save Rdata file\n',
    '-savp=<plotdatafile> OR -savp, save cross-correlation plot\n',
    '-out=<resultfile>, append peakshift/phantomPeak results to a file\n',
    ' format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag\n',
    '-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists\n',
    '-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files\n'
  )
  # Every element already ends in a newline, so join with an empty separator
  cat(usage.lines, sep='', file=stderr())
} # end: print.usage()
67
get.file.parts <- function(file.fullpath) {
  # ===================================
  # Break a file name (optionally with a leading path) into its components.
  # Returns a list with
  #   path     : directory part ('.' when the name has no separator)
  #   fullname : last path component, extension included
  #   name     : fullname without its final extension
  #   ext      : final extension including the leading '.', or '' if none
  # An empty string yields '' for all four fields.
  # Stops if file.fullpath is not a character value.
  # ===================================
  if (! is.character(file.fullpath)) {
    stop('File name must be a string')
  }

  path.tokens <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]]
  n.tokens <- length(path.tokens)

  # Empty file name: nothing to split
  if (n.tokens == 0) {
    return(list(path='', fullname='', name='', ext=''))
  }

  # Directory part is everything before the last token
  if (n.tokens == 1) {
    file.path <- '.'
    file.fullname <- path.tokens
  } else {
    file.path <- paste(path.tokens[-n.tokens], collapse=.Platform$file.sep)
    file.fullname <- path.tokens[n.tokens]
  }

  # Extension is the text after the last '.'
  name.tokens <- strsplit(file.fullname, '.', fixed=TRUE)[[1]]
  n.name.tokens <- length(name.tokens)
  if (n.name.tokens == 1) {
    file.ext <- ''
    file.name <- name.tokens
  } else {
    file.ext <- paste('.', name.tokens[n.name.tokens], sep="")
    file.name <- paste(name.tokens[1:(n.name.tokens-1)], collapse=".")
  }

  return(list(path=file.path,
              fullname=file.fullname,
              name=file.name,
              ext=file.ext))
} # end: get.file.parts()
107
parse.arguments <- function(args) {
  # ===================================
  # Parse and validate the command-line options of run_spp.R.
  #
  # Args:
  #   args: character vector of options, e.g. from commandArgs(trailingOnly=TRUE)
  # Returns a named list:
  #   chip.file, isurl.chip.file, control.file, isurl.control.file,
  #   sep.range (min, step, max), sep.peak, ex.range (min, max; max stays NaN
  #   when -x is not given), n.nodes, fdr, npeak, temp.dir, chrname.rm.pattern,
  #   output.odir, output.npeak.file, output.rpeak.file, output.rdata.file,
  #   output.plot.file, output.result.file, replace.flag, clean.files.flag
  #   Options that were not supplied are NA (flags are FALSE).
  # Stops with an error on unknown or malformed options, when -c is missing,
  # when peak output is requested without -i, or when an input file/URL
  # cannot be found.
  # ===================================

  # ---- Local helpers ----

  # Text after the first '=' of an option. Everything after that '=' is kept,
  # so file names and URLs that themselves contain '=' are not truncated.
  opt.value <- function(arg, missing.msg) {
    val <- sub('^[^=]*=', '', arg)
    if (!nzchar(val)) {
      stop(missing.msg, call.=FALSE)
    }
    val
  }

  # Convert text to numeric, stopping with nonnumeric.msg if any value is not a number
  opt.numeric <- function(text, nonnumeric.msg) {
    num <- suppressWarnings(as.numeric(text))
    if (any(is.na(num))) {
      stop(nonnumeric.msg, call.=FALSE)
    }
    num
  }

  # Parse a ':'-separated list of exactly n.parts numbers, rounded to integers
  opt.range <- function(text, n.parts, format.msg, nonnumeric.msg) {
    parts <- strsplit(text, ':', fixed=TRUE)[[1]]
    if (length(parts) != n.parts) {
      stop(format.msg, call.=FALSE)
    }
    round(opt.numeric(parts, nonnumeric.msg))
  }

  # Parse '-flag' or '-flag=<filename>'.
  # Returns NULL for the bare flag (name is derived from the ChIP file later).
  opt.save.file <- function(arg, flag, usage.msg) {
    if (arg == flag) {
      return(NULL)
    }
    if (substr(arg, 1, nchar(flag) + 1) != paste(flag, '=', sep='')) {
      stop(usage.msg, call.=FALSE)
    }
    opt.value(arg, usage.msg)
  }

  # TRUE when an output was requested, either with a derived (NULL) or explicit name
  is.requested <- function(output.file) {
    is.null(output.file) || !is.na(output.file)
  }

  # ---- Defaults ----
  chip.file <- NA              # main ChIP tagAlign/BAM file name
  control.file <- NA           # control tagAlign/BAM file name
  sep.min <- -100              # min strand shift
  sep.max <- 600               # max strand shift
  sep.bin <- 5                 # increment for strand shift
  sep.peak <- NA               # user-defined peak shift
  exclude.min <- 10            # lowerbound of strand shift exclusion region
  exclude.max <- NaN           # upperbound of strand shift exclusion region
  n.nodes <- NA                # number of parallel processing nodes
  fdr <- 0.01                  # false discovery rate threshold for peak calling
  npeak <- NA                  # threshold on number of peaks to call
  temp.dir <- tempdir()        # temporary directory
  chrname.rm.pattern <- NA     # chromosome name pattern used to remove tags
  output.odir <- NA            # Output directory name
  output.npeak.file <- NA      # Output narrowPeak file name
  output.rpeak.file <- NA      # Output regionPeak file name
  output.rdata.file <- NA      # Rdata file
  output.plot.file <- NA       # cross correlation plot file
  output.result.file <- NA     # result file
  replace.flag <- FALSE        # replace file flag
  clean.files.flag <- FALSE    # file deletion flag

  # ---- Parse arguments ----
  for (each.arg in args) {

    if (grepl('^-c=', each.arg)) {                 # -c=<chip.file>
      chip.file <- opt.value(each.arg, 'No tagAlign/BAM file name provided for parameter -c=')

    } else if (grepl('^-i=', each.arg)) {          # -i=<control.file>
      control.file <- opt.value(each.arg, 'No tagAlign/BAM file name provided for parameter -i=')

    } else if (grepl('^-s=', each.arg)) {          # -s=<sep.min>:<sep.bin>:<sep.max>
      sep.usage <- 'Strand shift limits must be specified as -s=sepmin:sepbin:sepmax'
      sep.vals <- opt.range(opt.value(each.arg, sep.usage), 3, sep.usage,
                            'Strand shift limits must be numeric values')
      sep.min <- sep.vals[1]
      sep.bin <- sep.vals[2]
      sep.max <- sep.vals[3]
      # A step of 0 is rejected too: the step is later used as a divisor
      if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin <= 0)) {
        stop('Illegal separation values -s=sepmin:sepbin:sepmax')
      }

    } else if (grepl('^-speak=', each.arg)) {      # -speak=<sep.peak>
      sep.peak <- opt.numeric(opt.value(each.arg, 'User defined peak shift must be provided as -speak=<sep.peak>'),
                              '-speak=<sep.peak>: User defined peak shift must be numeric')

    } else if (grepl('^-x=', each.arg)) {          # -x=<exclude.min>:<exclude.max>
      exclude.usage <- 'Exclusion limits must be specified as -x=excludemin:excludemax'
      exclude.vals <- opt.range(opt.value(each.arg, exclude.usage), 2, exclude.usage,
                                'Exclusion limits must be numeric values')
      exclude.min <- exclude.vals[1]
      exclude.max <- exclude.vals[2]
      if (exclude.min > exclude.max) {
        stop('Illegal exclusion limits -x=excludemin:excludemax')
      }

    } else if (grepl('^-p=', each.arg)) {          # -p=<n.nodes>
      n.nodes <- round(opt.numeric(opt.value(each.arg, 'Number of parallel nodes must be provided as -p=<numnodes>'),
                                   '-p=<numnodes>: numnodes must be numeric'))

    } else if (grepl('^-fdr=', each.arg)) {        # -fdr=<fdr>
      fdr <- opt.numeric(opt.value(each.arg, 'False discovery rate must be provided as -fdr=<fdr>'),
                         '-fdr=<falseDiscoveryRate>: false discovery rate must be numeric')

    } else if (grepl('^-npeak=', each.arg)) {      # -npeak=<numPeaks>
      npeak <- round(opt.numeric(opt.value(each.arg, 'Threshold on number of peaks must be provided as -npeak=<numPeaks>'),
                                 '-npeak=<numPeaks>: threshold on number of peaks must be numeric'))

    } else if (grepl('^-tmpdir=', each.arg)) {     # -tmpdir=<temp.dir>
      temp.dir <- opt.value(each.arg, 'No temporary directory provided for parameter -tmpdir=')

    } else if (grepl('^-filtchr=', each.arg)) {    # -filtchr=<chrname.rm.pattern>
      chrname.rm.pattern <- opt.value(each.arg, 'No pattern provided for parameter -filtchr=')

    } else if (grepl('^-odir=', each.arg)) {       # -odir=<output.odir>
      output.odir <- opt.value(each.arg, 'No output directory provided for parameter -odir=')

    } else if (grepl('^-savn', each.arg)) {        # -savn=<output.npeak.file> OR -savn
      output.npeak.file <- opt.save.file(each.arg, '-savn',
                                         'Argument for saving narrowPeak file must be -savn or -savn=<filename>')

    } else if (grepl('^-savr', each.arg)) {        # -savr=<output.rpeak.file> OR -savr
      output.rpeak.file <- opt.save.file(each.arg, '-savr',
                                         'Argument for saving regionPeak file must be -savr or -savr=<filename>')

    } else if (grepl('^-savd', each.arg)) {        # -savd=<output.rdata.file> OR -savd
      output.rdata.file <- opt.save.file(each.arg, '-savd',
                                         'Argument for saving Rdata file must be -savd or -savd=<filename>')

    } else if (grepl('^-savp', each.arg)) {        # -savp=<output.plot.file> OR -savp
      output.plot.file <- opt.save.file(each.arg, '-savp',
                                        'Argument for saving plot file must be -savp or -savp=<filename>')

    } else if (grepl('^-out=', each.arg)) {        # -out=<output.result.file>
      output.result.file <- opt.value(each.arg, 'No result file provided for parameter -out=')

    } else if (each.arg == '-rf') {
      replace.flag <- TRUE

    } else if (each.arg == '-clean') {
      clean.files.flag <- TRUE

    } else {
      stop('Illegal argument ',each.arg)
    }
  }
  # End: for loop

  # ---- Check mandatory arguments ----
  if (is.na(chip.file)) {
    stop('-c=<tagAlign/BAMFileName> is a mandatory argument')
  }

  # Both peak outputs need a control: their default names are built from it below.
  # The NULL sentinel (bare -savn / -savr) must be tested with is.null(), not is.na().
  if (is.na(control.file) && (is.requested(output.npeak.file) || is.requested(output.rpeak.file))) {
    stop('-i=<tagAlign/BAMFileName> is required for peak calling')
  }

  # ---- Check if ChIP and control files are URLs ----
  url.pattern <- '^(https?|ftp)://'
  isurl.chip.file <- grepl(url.pattern, chip.file)
  isurl.control.file <- !is.na(control.file) && grepl(url.pattern, control.file)

  # If ChIP file is a URL output.odir MUST be specified
  if (isurl.chip.file && is.na(output.odir)) {
    stop('If ChIP file is a URL, then output directory MUST be specified')
  }

  # ---- Check that ChIP and control files exist ----
  # URLs are shell-quoted so characters such as '&' or '?' reach wget intact
  if (isurl.chip.file) {
    if (system(paste('wget -q --spider', shQuote(chip.file))) != 0) {
      stop('ChIP file URL not valid: ',chip.file)
    }
  } else if (!file.exists(chip.file)) {
    stop('ChIP File:',chip.file,' does not exist')
  }

  if (!is.na(control.file)) {
    if (isurl.control.file) {
      if (system(paste('wget -q --spider', shQuote(control.file))) != 0) {
        stop('Control file URL not valid: ',control.file)
      }
    } else if (!file.exists(control.file)) {
      stop('Control File:',control.file,' does not exist')
    }
  }

  # ---- Fill in derived defaults ----
  if (is.na(output.odir)) { # output directory defaults to the ChIP file directory
    output.odir <- get.file.parts(chip.file)$path
  }

  if (is.null(output.npeak.file)) { # <chip>_VS_<control>.narrowPeak
    output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep=""))
  }

  if (is.null(output.rpeak.file)) { # <chip>_VS_<control>.regionPeak
    output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep=""))
  }

  if (is.null(output.rdata.file)) { # <chip>.Rdata
    output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep=""))
  }

  if (is.null(output.plot.file)) { # <chip>.pdf
    output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep=""))
  }

  return(list(chip.file=chip.file,
              isurl.chip.file=isurl.chip.file,
              control.file=control.file,
              isurl.control.file=isurl.control.file,
              sep.range=c(sep.min,sep.bin,sep.max),
              sep.peak=sep.peak,
              ex.range=c(exclude.min,exclude.max),
              n.nodes=n.nodes,
              fdr=fdr,
              npeak=npeak,
              temp.dir=temp.dir,
              chrname.rm.pattern=chrname.rm.pattern,
              output.odir=output.odir,
              output.npeak.file=output.npeak.file,
              output.rpeak.file=output.rpeak.file,
              output.rdata.file=output.rdata.file,
              output.plot.file=output.plot.file,
              output.result.file=output.result.file,
              replace.flag=replace.flag,
              clean.files.flag=clean.files.flag))
} # end: parse.arguments()
433
read.align <- function(align.filename) {
  # ===================================
  # Read a tagAlign or BAM file.
  #
  # Args:
  #   align.filename: path of the file; the format is chosen from the name
  #                   ('.tagAlign' anywhere in the name wins over '.bam')
  # Returns:
  #   the object returned by read.tagalign.tags() with an added $read.length
  #   (rounded median of end - start over the first 500 rows)
  # BAM files are converted to a temporary tagAlign file with samtools and awk;
  # that temporary file is deleted after reading.
  # Quits R with status 1 on conversion failure or unknown file format.
  # ===================================

  # Estimate read length from columns 2 (start) and 3 (end) of a tagAlign file
  estimate.read.length <- function(tagalign.filename) {
    tmpDataRows <- read.table(tagalign.filename,nrows=500)
    round(median(tmpDataRows$V3 - tmpDataRows$V2))
  }

  if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file
    chip.data <- read.tagalign.tags(align.filename)
    chip.data$read.length <- estimate.read.length(align.filename)

  } else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file
    # name of the converted file: first '.bam' replaced by '.tagAlign'
    bam2align.filename <- sub('\\.bam','.tagAlign',align.filename)
    # Build "samtools view | awk > file"; file names are shell-quoted so paths
    # containing spaces or shell metacharacters do not break the pipeline
    command <- vector(length=2)
    command[1] <- sprintf("samtools view -F 0x0204 -o - %s",shQuote(align.filename))
    command[2] <- paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", shQuote(bam2align.filename), sep="")
    command <- paste(command,collapse=" | ")
    # Run command
    status <- system(command,intern=FALSE,ignore.stderr=FALSE)
    if ((status != 0) || !file.exists(bam2align.filename)) {
      cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr())
      q(save="no",status=1)
    }
    # read converted BAM file
    chip.data <- read.tagalign.tags(bam2align.filename)
    chip.data$read.length <- estimate.read.length(bam2align.filename)
    # delete temporary tagalign file
    file.remove(bam2align.filename)

  } else {
    # report the offending file (the original referenced an undefined variable here)
    cat(sprintf("Error:Unknown file format for file:%s\n",align.filename),file=stderr())
    q(save="no",status=1)
  }
  return(chip.data)
} # end: read.align()
471
print.run.params <- function(params){
  # ===================================
  # Output run parameters to stdout.
  #
  # Args:
  #   params: list as returned by parse.arguments()
  # The values are read from the 'params' argument (not from a global variable),
  # so the function works for any parameter list passed in.
  # ===================================
  cat('################\n',file=stdout())
  # 18 labels for 18 printed values: sep.range contributes 3 and ex.range 2.
  # NOTE(review): labels are attached per output line as produced by fill=18;
  # confirm that short values never share a line, or labels would drift.
  cat(params$chip.file,
      params$control.file,
      params$sep.range,
      params$sep.peak,
      params$ex.range,
      params$n.nodes,
      params$fdr,
      params$npeak,
      params$output.odir,
      params$output.npeak.file,
      params$output.rpeak.file,
      params$output.rdata.file,
      params$output.plot.file,
      params$output.result.file,
      params$replace.flag,
      labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift',
               'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:',
               'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:',
               'plot pdf filename:','result filename:','Overwrite files?:'),
      fill=18,
      file=stdout())
  cat('\n',file=stdout())
} # end: print.run.params()
500
check.replace.flag <- function(params){
  # ===================================
  # Abort the run if an output file already exists and -rf was not given.
  #
  # Args:
  #   params: list as returned by parse.arguments(); reads replace.flag and the
  #           output.npeak/rpeak/plot/rdata file names (NA = output not requested)
  # Returns NULL invisibly when the run may proceed; otherwise prints a message
  # to stderr and quits R with status 1.
  # The values are read from the 'params' argument (not from a global variable).
  # ===================================
  if (params$replace.flag) {
    return(invisible(NULL))
  }

  # Checked in this order; the name is the label used in the abort message
  output.files <- list('narrowPeak' = params$output.npeak.file,
                       'regionPeak' = params$output.rpeak.file,
                       'Plot'       = params$output.plot.file,
                       'Rdata'      = params$output.rdata.file)

  for (file.label in names(output.files)) {
    output.file <- output.files[[file.label]]
    if (is.null(output.file) || is.na(output.file)) { # output not requested
      next
    }
    if (file.exists(output.file)) {
      cat(file.label, ' file already exists. Aborting Run. Use -rf if you want to overwrite\n', sep='', file=stderr())
      q(save="no",status=1)
    }
  }
  invisible(NULL)
}
533
534 # #############################################################################
535 # MAIN FUNCTION
536 # #############################################################################
537
# Check number of arguments
# parse.arguments() recognises 18 distinct options (-c -i -s -speak -x -p -fdr
# -npeak -tmpdir -filtchr -odir -savn -savr -savd -savp -out -rf -clean), so a
# call that uses every option once must be accepted
minargs <- 1
maxargs <- 18
if (nargs < minargs || nargs > maxargs) {
  print.usage()
  q(save="no",status=1)
}

# Parse arguments into the list iparams with fields:
#   chip.file, isurl.chip.file, control.file, isurl.control.file,
#   sep.range, sep.peak, ex.range, n.nodes, fdr, npeak, temp.dir,
#   chrname.rm.pattern, output.odir, output.npeak.file, output.rpeak.file,
#   output.rdata.file, output.plot.file, output.result.file,
#   replace.flag, clean.files.flag
iparams <- parse.arguments(args)

# Print run parameters
print.run.params(iparams)

# Check if output files exist (aborts unless -rf was given)
check.replace.flag(iparams)
573
# curr.chip.file and curr.control.file always point to the original ChIP and control files on disk
# ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align

# Download ChIP and control files if necessary to temp.dir
# The directory and URL are shell-quoted so that spaces or shell metacharacters
# (e.g. '&' or '?' in a URL) are passed to wget unchanged
if (iparams$isurl.chip.file) {
  # file is downloaded to temp.dir. Has same name as URL suffix
  curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname)
  cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout())
  if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$chip.file))) != 0) {
    stop('Error downloading ChIP file:',iparams$chip.file)
  }
} else {
  curr.chip.file <- iparams$chip.file # file is in original directory
}

if (iparams$isurl.control.file) {
  # file is downloaded to temp.dir. Has same name as URL suffix
  curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname)
  cat('Downloading control file:',iparams$control.file,"\n",file=stdout())
  if (system(paste('wget -N -q -P',shQuote(iparams$temp.dir),shQuote(iparams$control.file))) != 0) {
    stop('Error downloading Control file:',iparams$control.file)
  }
} else {
  curr.control.file <- iparams$control.file # file is in original directory
}
597
# unzip ChIP and input files if required AND copy to temp directory

# Place one alignment file in tmp.dir under a unique temporary name and return that name.
#   src.file   : file currently on disk (original or downloaded copy)
#   orig.name  : name/URL as given by the user, used in error messages
#   file.label : 'ChIP' or 'control', used in the progress message
#   tmp.dir    : directory receiving the temporary file
#   remove.src : if TRUE the source file is removed (gz) or moved (non-gz)
stage.align.file <- function(src.file, orig.name, file.label, tmp.dir, remove.src) {
  src.parts <- get.file.parts(src.file)
  if (src.parts$ext == '.gz') {
    # unzip file to tmp.dir/[filename with .gz removed][randsuffix]
    staged.file <- tempfile(src.parts$name, tmpdir=tmp.dir)
    cat('Decompressing ', file.label, ' file\n', sep='', file=stdout())
    # file names are shell-quoted so paths with spaces survive the redirect
    if (system(paste("gunzip -c",shQuote(src.file),">",shQuote(staged.file))) != 0) {
      stop('Unable to decompress file:', orig.name)
    }
    if (remove.src) { # Remove original file if clean.files.flag is set
      file.remove(src.file)
    }
  } else {
    # move or copy file to tmp.dir/[filename][randsuffix]
    staged.file <- tempfile(src.parts$fullname, tmpdir=tmp.dir)
    # file.rename can fail (e.g. source and tmp.dir on different file systems);
    # fall back to copy + delete instead of silently continuing without a file
    moved <- remove.src && suppressWarnings(file.rename(src.file,staged.file))
    if (!moved) {
      if (!file.copy(src.file,staged.file)) {
        stop('Unable to copy file:', orig.name, ' to temporary directory ', tmp.dir)
      }
      if (remove.src) {
        file.remove(src.file)
      }
    }
  }
  staged.file
}

ta.chip.filename <- stage.align.file(curr.chip.file, iparams$chip.file, 'ChIP',
                                     iparams$temp.dir, iparams$clean.files.flag)

if (! is.na(iparams$control.file)) {
  ta.control.filename <- stage.align.file(curr.control.file, iparams$control.file, 'control',
                                          iparams$temp.dir, iparams$clean.files.flag)
}
637
# Remove downloaded files
# Only files fetched from a URL are deleted here; the file.exists() test skips
# files that are already gone. Scalar conditions use the short-circuit '&&'.
if (iparams$isurl.chip.file && file.exists(curr.chip.file)) {
  file.remove(curr.chip.file)
}

if (! is.na(iparams$control.file)) {
  if (iparams$isurl.control.file && file.exists(curr.control.file)) {
    file.remove(curr.control.file)
  }
}
648
649 # Load SPP library
650 library(spp)
651
# Read ChIP tagAlign/BAM files
cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout())
chip.data <- read.align(ta.chip.filename)
cat("ChIP data read length",chip.data$read.length,"\n",file=stdout())
file.remove(ta.chip.filename) # Delete temporary file
if (length(chip.data$tags)==0) {
  stop('Error in ChIP file format:', iparams$chip.file)
}
# Remove illegal chromosome names (those matching the -filtchr pattern)
if (! is.na(iparams$chrname.rm.pattern)) {
  selectidx <- which(!grepl(iparams$chrname.rm.pattern,names(chip.data$tags)))
  chip.data$tags <- chip.data$tags[selectidx]
  chip.data$quality <- chip.data$quality[selectidx]
}
# Total number of tags over all remaining chromosomes
chip.data$num.tags <- sum(unlist(lapply(chip.data$tags,length)))

# Read Control tagAlign/BAM files
if (! is.na(iparams$control.file)) {
  cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout())
  control.data <- read.align(ta.control.filename)
  file.remove(ta.control.filename) # Delete temporary file
  if (length(control.data$tags)==0) {
    # report the control file here, not the ChIP file
    stop('Error in control file format:', iparams$control.file)
  }
  cat("Control data read length",control.data$read.length,"\n",file=stdout())
  # Remove illegal chromosome names (those matching the -filtchr pattern)
  if (! is.na(iparams$chrname.rm.pattern)) {
    selectidx <- which(!grepl(iparams$chrname.rm.pattern,names(control.data$tags)))
    control.data$tags <- control.data$tags[selectidx]
    control.data$quality <- control.data$quality[selectidx]
  }
  # Total number of tags over all remaining chromosomes
  control.data$num.tags <- sum(unlist(lapply(control.data$tags,length)))
}
685
# Open multiple processes if required
if (is.na(iparams$n.nodes)) {
  cluster.nodes <- NULL
} else {
  library(snow)
  cluster.nodes <- makeCluster(iparams$n.nodes)
}

# #################################
# Calculate cross-correlation for various strand shifts
# #################################
cat("Calculating peak characteristics\n",file=stdout())
# crosscorr
# $cross.correlation : Cross-correlation profile as an $x/$y data.frame
# $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak.
# $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak)
#
# The 'finally' clause stops the worker processes even when the computation
# fails, so no cluster is left running after an error
crosscorr <- tryCatch(
  get.binding.characteristics(chip.data,
                              srange=iparams$sep.range[c(1,3)],
                              bin=iparams$sep.range[2],
                              accept.all.tags=TRUE,
                              cluster=cluster.nodes),
  finally = if (!is.na(iparams$n.nodes)) {
    stopCluster(cluster.nodes)
  })
710
# Record the minimum of the raw (unsmoothed) cross-correlation profile:
# crosscorr$min.cc is the profile row (shift $x, value $y) where $y is smallest.
crosscorr$min.cc <- crosscorr$cross.correlation[which.min(crosscorr$cross.correlation$y), ]
cat("Minimum cross-correlation value", crosscorr$min.cc$y, "\n", file = stdout())
cat("Minimum cross-correlation shift", crosscorr$min.cc$x, "\n", file = stdout())

# Smooth a working copy of the profile with a running mean.
# The bandwidth is derived from the shift step (iparams$sep.range[2]) and is
# always odd: 2 * floor(k / 2) + 1 with k = ceiling(5 / step).
sbw <- 1 + 2 * floor(ceiling(5 / iparams$sep.range[2]) / 2) # smoothing bandwidth
cc <- crosscorr$cross.correlation
cc$y <- runmean(cc$y, sbw, alg = "fast")
718
# Compute cross-correlation peaks on the smoothed profile.
# bw is the lag (in profile rows) used for the comparison: cc[i] is compared
# to cc[i +/- bw] to find peaks.
bw <- ceiling(2 / iparams$sep.range[2])
# TRUE where the curve is non-decreasing over a lag of bw (cc[i] >= cc[i-bw]) ...
peakidx <- diff(cc$y, lag = bw) >= 0
# ... a change of -1 marks a switch from non-decreasing to decreasing ...
peakidx <- diff(peakidx, lag = bw)
# ... and adding bw converts the position in the differenced vector back to a row of cc.
peakidx <- bw + which(peakidx == -1)

# Exclude peaks from the excluded region.
# When the upper bound of the exclusion range is NaN it is set to read length + 10.
if (is.nan(iparams$ex.range[2])) {
  iparams$ex.range[2] <- 10 + chip.data$read.length
}
# Keep only candidates whose shift lies strictly outside [ex.range[1], ex.range[2]].
peakidx <- peakidx[(cc$x[peakidx] > iparams$ex.range[2]) | (cc$x[peakidx] < iparams$ex.range[1])]
cc <- cc[peakidx, ]
731
# Find max peak position and other peaks within 0.9*max_peakvalue that are
# further away from maxpeakposition (i.e. at a shift >= the max peak's shift).
maxpeakidx <- which.max(cc$y)
maxpeakshift <- cc$x[maxpeakidx]
maxpeakval <- cc$y[maxpeakidx]
peakidx <- which((cc$y >= 0.9 * maxpeakval) & (cc$x >= maxpeakshift))
cc <- cc[peakidx, ]

# Sort the peaks by height and keep at most the top 3.
# seq_len() is used instead of 1:min(3, n): when no candidate peak survives
# (n == 0), 1:0 would expand to c(1, 0) and fabricate an all-NA row, whereas
# seq_len(0) correctly yields an empty selection.
sortidx <- order(cc$y, decreasing = TRUE)
sortidx <- sortidx[seq_len(min(3, length(sortidx)))]
cc.peak <- cc[sortidx, ]
743
# Override peak shift if user supplies peak shift.
# The correlation at the requested shift(s) is linearly interpolated from the
# raw profile; rule = 2 makes approx() return the value at the nearest data
# extreme for shifts outside the profile's range.
if (!is.na(iparams$sep.peak)) {
  cc.peak <- approx(x = crosscorr$cross.correlation$x,
                    y = crosscorr$cross.correlation$y,
                    xout = iparams$sep.peak,
                    rule = 2)
}
cat("Peak cross-correlation value", paste(cc.peak$y, collapse = ","), "\n", file = stdout())
cat("Peak strand shift", paste(cc.peak$x, collapse = ","), "\n", file = stdout())

# Reset values in crosscorr: only the first (top-ranked) peak is stored.
crosscorr$peak$x <- cc.peak$x[1]
crosscorr$peak$y <- cc.peak$y[1]

# Compute window half size: the largest shift whose raw cross-correlation is
# at least one third of the way from the minimum up to the peak value.
whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y) / 3
crosscorr$whs <- max(crosscorr$cross.correlation$x[whs.thresh <= crosscorr$cross.correlation$y])
cat("Window half size", crosscorr$whs, "\n", file = stdout())
759
# Compute phantom peak coefficient.
# The phantom peak is the highest raw cross-correlation value among shifts in
# [read.length - round(2*step), read.length + round(1.5*step)], where step is
# iparams$sep.range[2].
ph.peakidx <- which((crosscorr$cross.correlation$x >= (chip.data$read.length - round(2 * iparams$sep.range[2]))) &
                    (crosscorr$cross.correlation$x <= (chip.data$read.length + round(1.5 * iparams$sep.range[2]))))
ph.peakidx <- ph.peakidx[which.max(crosscorr$cross.correlation$y[ph.peakidx])]
crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx, ]
cat("Phantom peak location", crosscorr$phantom.cc$x, "\n", file = stdout())
cat("Phantom peak Correlation", crosscorr$phantom.cc$y, "\n", file = stdout())

# phantom.coeff is the peak correlation divided by the MINIMUM correlation.
# (A previous assignment of peak / phantom correlation was immediately
# overwritten by this one and has been removed as a dead store.)
crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y
cat("Normalized cross-correlation coefficient (NCCC)", crosscorr$phantom.coeff, "\n", file = stdout())

# rel.phantom.coeff compares peak and phantom peak, both measured above the minimum.
crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y)
cat("Relative Cross correlation Coefficient (RCCC)", crosscorr$rel.phantom.coeff, "\n", file = stdout())

# Map rel.phantom.coeff onto a quality tag:
#   [0, 0.25) -> -2, [0.25, 0.5) -> -1, [0.5, 1) -> 0, [1, 1.5) -> 1, >= 1.5 -> 2
# A negative coefficient leaves the tag as NA.
# Scalar && is used because each condition feeds an `if`.
crosscorr$phantom.quality.tag <- NA
if ((crosscorr$rel.phantom.coeff >= 0) && (crosscorr$rel.phantom.coeff < 0.25)) {
  crosscorr$phantom.quality.tag <- -2
} else if ((crosscorr$rel.phantom.coeff >= 0.25) && (crosscorr$rel.phantom.coeff < 0.5)) {
  crosscorr$phantom.quality.tag <- -1
} else if ((crosscorr$rel.phantom.coeff >= 0.5) && (crosscorr$rel.phantom.coeff < 1)) {
  crosscorr$phantom.quality.tag <- 0
} else if ((crosscorr$rel.phantom.coeff >= 1) && (crosscorr$rel.phantom.coeff < 1.5)) {
  crosscorr$phantom.quality.tag <- 1
} else if (crosscorr$rel.phantom.coeff >= 1.5) {
  crosscorr$phantom.quality.tag <- 2
}
cat("Phantom Peak Quality Tag", crosscorr$phantom.quality.tag, "\n", file = stdout())
785
# Output result to result file if required.
# One tab-separated line is appended per run, with the fields in this order:
#   file name, number of tags, peak shift(s), peak correlation(s),
#   phantom peak shift, phantom peak correlation, shift of minimum correlation,
#   minimum correlation, phantom.coeff, rel.phantom.coeff, quality tag
# Multiple peak shifts/correlations are comma-joined inside their field.
if (!is.na(iparams$output.result.file)) {
  result.fields <- list(
    get.file.parts(iparams$chip.file)$fullname,
    chip.data$num.tags,
    paste(cc.peak$x, collapse = ","),
    paste(cc.peak$y, collapse = ","),
    crosscorr$phantom.cc$x,
    crosscorr$phantom.cc$y,
    crosscorr$min.cc$x,
    crosscorr$min.cc$y,
    crosscorr$phantom.coeff,
    crosscorr$rel.phantom.coeff,
    crosscorr$phantom.quality.tag
  )
  # Fields are handed to cat() as separate arguments so numbers keep cat()'s
  # own formatting.
  do.call(cat, c(result.fields,
                 list(sep = "\t",
                      file = iparams$output.result.file,
                      append = TRUE)))
  # Terminate the record.
  cat("\n", file = iparams$output.result.file, append = TRUE)
}
807
# Save figure if required.
# Writes a 5x5 inch PDF of the raw cross-correlation profile. Dashed vertical
# lines mark the selected peak shift(s) (colour 2) and the phantom peak shift
# (colour 4); the subtitle reports phantom.coeff, rel.phantom.coeff and the
# quality tag.
if (!is.na(iparams$output.plot.file)) {
  pdf(file = iparams$output.plot.file, width = 5, height = 5)
  par(mar = c(4, 3.5, 2, 0.5), mgp = c(1.5, 0.5, 0), cex = 0.8)
  plot(
    crosscorr$cross.correlation,
    type = "l",
    xlab = sprintf("strand-shift (%s)", paste(cc.peak$x, collapse = ",")),
    ylab = "cross-correlation"
  )
  abline(v = cc.peak$x, lty = 2, col = 2)
  abline(v = crosscorr$phantom.cc$x, lty = 2, col = 4)
  title(
    main = get.file.parts(iparams$chip.file)$fullname,
    sub = sprintf("NSC=%g,RSC=%g,Qtag=%d",
                  crosscorr$phantom.coeff,
                  crosscorr$rel.phantom.coeff,
                  crosscorr$phantom.quality.tag)
  )
  dev.off()
}
822
# Save RData file if required.
# Stores the run parameters, the cross-correlation results and the selected
# peak(s) under their current variable names.
if (!is.na(iparams$output.rdata.file)) {
  save(list = c("iparams", "crosscorr", "cc.peak"),
       file = iparams$output.rdata.file)
}
830
# #################################
# Call peaks
# #################################
# Runs only when a narrowPeak and/or a regionPeak output file was requested.

if (!is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file)) {

  # Remove local tag anomalies.
  # From here on chip.data / control.data hold the filtered tag lists only.
  cat('Removing read stacks\n', file = stdout())
  chip.data <- remove.local.tag.anomalies(chip.data$tags)
  control.data <- remove.local.tag.anomalies(control.data$tags)

  # Open multiple processes if required
  if (is.na(iparams$n.nodes)) {
    cluster.nodes <- NULL
  } else {
    cluster.nodes <- makeCluster(iparams$n.nodes)
  }

  # Find peaks
  cat('Finding peaks\n', file = stdout())
  if (!is.na(iparams$npeak)) {
    # A peak-count threshold overrides the FDR threshold with 0.96.
    # NOTE(review): presumably so enough candidates are called for the
    # npeaks cut applied when writing the output files - confirm.
    iparams$fdr <- 0.96
  }
  # tryCatch(..., finally = ...) guarantees the cluster is stopped even when
  # peak finding fails; the error itself still propagates.
  narrow.peaks <- tryCatch(
    find.binding.positions(signal.data = chip.data,
                           control.data = control.data,
                           fdr = iparams$fdr,
                           method = tag.lwcc,
                           whs = crosscorr$whs,
                           cluster = cluster.nodes),
    finally = {
      if (!is.na(iparams$n.nodes)) {
        stopCluster(cluster.nodes)
      }
    }
  )
  cat(paste("Detected", sum(unlist(lapply(narrow.peaks$npl, function(d) length(d$x)))), "peaks"), "\n", file = stdout())

  # Write to narrowPeak file, then gzip it in place.
  # The path is shell-quoted so names containing spaces or shell
  # metacharacters are passed to gzip as a single argument.
  if (!is.na(iparams$output.npeak.file)) {
    write.narrowpeak.binding(narrow.peaks, iparams$output.npeak.file, margin = round(crosscorr$whs / 2), npeaks = iparams$npeak)
    system(paste('gzip -f ', shQuote(iparams$output.npeak.file)))
  }

  # Compute and write regionPeak file.
  # region.peaks is initialised to NULL so that the save() below works when
  # only a narrowPeak file was requested (it was previously undefined then).
  region.peaks <- NULL
  if (!is.na(iparams$output.rpeak.file)) {
    region.peaks <- add.broad.peak.regions(chip.data, control.data, narrow.peaks, window.size = max(50, round(crosscorr$whs / 4)), z.thr = 10)
    write.narrowpeak.binding(region.peaks, iparams$output.rpeak.file, margin = round(crosscorr$whs / 2), npeaks = iparams$npeak)
    system(paste('gzip -f ', shQuote(iparams$output.rpeak.file)))
  }

  # Save Rdata file (overwrites the one written after the cross-correlation
  # step, now including the peak calls).
  if (!is.na(iparams$output.rdata.file)) {
    save(iparams,
         crosscorr,
         cc.peak,
         narrow.peaks,
         region.peaks,
         file = iparams$output.rdata.file)
  }

}
884
885