Mercurial > repos > stemcellcommons > spp_tool
comparison run_spp.R @ 0:b15734276ca3 draft
Initial upload.
| author | stemcellcommons | 
|---|---|
| date | Thu, 17 Oct 2013 12:39:45 -0400 | 
| parents | |
| children | 23b22c1692fa | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:b15734276ca3 | 
|---|---|
| 1 # run_spp.R | |
| 2 # ============= | |
| 3 # Author: Anshul Kundaje, Computer Science Dept., Stanford University | |
| 4 # Email: akundaje@stanford.edu | |
| 5 # Last updated: Oct 8, 2010 | |
| 6 # ============= | |
| 7 # MANDATORY ARGUMENTS | |
| 8 # -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) | |
| 9 # MANDATORY ARGUMENT FOR PEAK CALLING | |
| 10 # -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) | |
| 11 # OPTIONAL ARGUMENTS | |
| 12 # -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600 | |
| 13 # -speak=<strPeak>, user-defined cross-correlation peak strandshift | |
| 14 # -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10) | |
| 15 # -p=<nodes> , number of parallel processing nodes, default=NULL | |
| 16 # -fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling | |
| 17 # -npeak=<numPeaks>, threshold on number of peaks to call | |
| 18 # -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used) | |
| 19 # -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name | |
| 20 # OUTPUT PARAMETERS | |
| 21 # -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used) | |
| 22 # -savn=<narrowpeakfilename> OR -savn NarrowPeak file name | |
| 23 # -savr=<regionpeakfilename> OR -savr RegionPeak file name | |
| 24 # -savd=<rdatafile> OR -savd , save Rdata file | |
| 25 # -savp=<plotdatafile> OR -savp , save cross-correlation plot | |
| 26 # -out=<resultfile>, append peakshift result to a file | |
| 27 # format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag | |
| 28 # -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists | |
| 29 # -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files | |
| 30 | |
| 31 args <- commandArgs(trailingOnly=TRUE); # Read Arguments from command line | |
| 32 nargs = length(args); # number of arguments | |
| 33 | |
| 34 # ########################################################################### | |
| 35 # AUXILIARY FUNCTIONS | |
| 36 # ########################################################################### | |
| 37 | |
| 38 print.usage <- function() { | |
| 39 # =================================== | |
| 40 # Function will print function usage | |
| 41 # =================================== | |
| 42 cat('Usage: Rscript run_spp.R <options>\n',file=stderr()) | |
| 43 cat('MANDATORY ARGUMENTS\n',file=stderr()) | |
| 44 cat('-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped)(FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr()) | |
| 45 cat('MANDATORY ARGUMENTS FOR PEAK CALLING\n',file=stderr()) | |
| 46 cat('-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr()) | |
| 47 cat('OPTIONAL ARGUMENTS\n',file=stderr()) | |
| 48 cat('-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600\n',file=stderr()) | |
| 49 cat('-speak=<strPeak>, user-defined cross-correlation peak strandshift\n',file=stderr()) | |
| 50 cat('-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)\n',file=stderr()) | |
| 51 cat('-p=<nodes> , number of parallel processing nodes, default=0\n',file=stderr()) | |
| 52 cat('-fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling\n',file=stderr()) | |
| 53 cat('-npeak=<numPeaks>, threshold on number of peaks to call\n',file=stderr()) | |
| 54 cat('-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)\n',file=stderr()) | |
| 55 cat('-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name\n',file=stderr()) | |
| 56 cat('OUTPUT ARGUMENTS\n',file=stderr()) | |
| 57 cat('-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)\n',file=stderr()) | |
| 58 cat('-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)\n',file=stderr()) | |
| 59 cat('-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)\n',file=stderr()) | |
| 60 cat('-savd=<rdatafile> OR -savd, save Rdata file\n',file=stderr()) | |
| 61 cat('-savp=<plotdatafile> OR -savp, save cross-correlation plot\n',file=stderr()) | |
| 62 cat('-out=<resultfile>, append peakshift/phantomPeak results to a file\n',file=stderr()) | |
| 63 cat(' format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag)\n',file=stderr()) | |
| 64 cat('-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists\n',file=stderr()) | |
| 65 cat('-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files\n',file=stderr()) | |
| 66 } # end: print.usage() | |
| 67 | |
| 68 get.file.parts <- function(file.fullpath) { | |
| 69 # =================================== | |
| 70 # Function will take a file name with path and split the file name into | |
| 71 # path, fullname, name and ext | |
| 72 # =================================== | |
| 73 if (! is.character(file.fullpath)) { | |
| 74 stop('File name must be a string') | |
| 75 } | |
| 76 | |
| 77 file.parts <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]] # split on file separator | |
| 78 | |
| 79 if (length(file.parts) == 0) { # if empty file name | |
| 80 return(list(path='', | |
| 81 fullname='', | |
| 82 name='', | |
| 83 ext='') | |
| 84 ) | |
| 85 } else { | |
| 86 if (length(file.parts) == 1) { # if no path then just the file name itself | |
| 87 file.path <- '.' | |
| 88 file.fullname <- file.parts | |
| 89 } else { | |
| 90 file.path <- paste(file.parts[1:(length(file.parts)-1)], collapse=.Platform$file.sep) # 1:last-1 token is path | |
| 91 file.fullname <- file.parts[length(file.parts)] # last token is filename | |
| 92 } | |
| 93 file.fullname.parts <- strsplit(file.fullname,'.',fixed=TRUE)[[1]] # split on . | |
| 94 if (length(file.fullname.parts) == 1) { # if no extension | |
| 95 file.ext <- '' | |
| 96 file.name <- file.fullname.parts | |
| 97 } else { | |
| 98 file.ext <- paste('.', file.fullname.parts[length(file.fullname.parts)], sep="") # add the . to the last token | |
| 99 file.name <- paste(file.fullname.parts[1:(length(file.fullname.parts)-1)], collapse=".") | |
| 100 } | |
| 101 return(list(path=file.path, | |
| 102 fullname=file.fullname, | |
| 103 name=file.name, | |
| 104 ext=file.ext)) | |
| 105 } | |
| 106 } # end: get.file.parts() | |
| 107 | |
| 108 parse.arguments <- function(args) { | |
| 109 # =================================== | |
| 110 # Function will parse arguments | |
| 111 # =================================== | |
| 112 # Set arguments to default values | |
| 113 chip.file <- NA # main ChIP tagAlign/BAM file name | |
| 114 isurl.chip.file <- FALSE # flag indicating whether ChIP file is a URL | |
| 115 control.file <- NA # control tagAlign/BAM file name | |
| 116 isurl.control.file <- FALSE # flag indicating whether control file is a URL | |
| 117 sep.min <- -100 # min strand shift | |
| 118 sep.max <- 600 # max strand shift | |
| 119 sep.bin <- 5 # increment for strand shift | |
| 120 sep.peak <- NA # user-defined peak shift | |
| 121 exclude.min <- 10 # lowerbound of strand shift exclusion region | |
| 122 exclude.max <- NaN # upperbound of strand shift exclusion region | |
| 123 n.nodes <- NA # number of parallel processing nodes | |
| 124 fdr <- 0.01 # false discovery rate threshold for peak calling | |
| 125 npeak <- NA # threshold on number of peaks to call | |
| 126 temp.dir <- tempdir() # temporary directory | |
| 127 chrname.rm.pattern <- NA # chromosome name pattern used to remove tags | |
| 128 output.odir <- NA # Output directory name | |
| 129 output.npeak.file <- NA # Output narrowPeak file name | |
| 130 output.rpeak.file <- NA # Output regionPeak file name | |
| 131 output.rdata.file <- NA # Rdata file | |
| 132 output.plot.file <- NA # cross correlation plot file | |
| 133 output.result.file <- NA # result file | |
| 134 replace.flag <- FALSE # replace file flag | |
| 135 clean.files.flag <- FALSE # file deletion flag | |
| 136 | |
| 137 # Parse arguments | |
| 138 for (each.arg in args) { | |
| 139 | |
| 140 if (grepl('^-c=',each.arg)) { #-c=<chip.file> | |
| 141 | |
| 142 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 143 if (! is.na(arg.split[2]) ) { | |
| 144 chip.file <- arg.split[2] # second part is chip.file | |
| 145 } else { | |
| 146 stop('No tagAlign/BAM file name provided for parameter -c=') | |
| 147 } | |
| 148 | |
| 149 } else if (grepl('^-i=',each.arg)) { #-i=<control.file> | |
| 150 | |
| 151 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 152 if (! is.na(arg.split[2]) ) { | |
| 153 control.file <- arg.split[2] # second part is control.file | |
| 154 } else { | |
| 155 stop('No tagAlign/BAM file name provided for parameter -i=') | |
| 156 } | |
| 157 | |
| 158 } else if (grepl('^-s=',each.arg)) { #-s=<sep.min>:<sep.bin>:<sep.max> | |
| 159 | |
| 160 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 161 if (! is.na(arg.split[2]) ) { | |
| 162 sep.vals <- arg.split[2] # second part is sepmin:sepbin:sepmax | |
| 163 sep.vals.split <- strsplit(sep.vals,':',fixed=TRUE)[[1]] # split on : | |
| 164 if (length(sep.vals.split) != 3) { # must have 3 parts | |
| 165 stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax') | |
| 166 } else { | |
| 167 if (any(is.na(as.numeric(sep.vals.split)))) { # check that sep vals are numeric | |
| 168 stop('Strand shift limits must be numeric values') | |
| 169 } | |
| 170 sep.min <- round(as.numeric(sep.vals.split[1])) | |
| 171 sep.bin <- round(as.numeric(sep.vals.split[2])) | |
| 172 sep.max <- round(as.numeric(sep.vals.split[3])) | |
| 173 if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin < 0)) { | |
| 174 stop('Illegal separation values -s=sepmin:sepbin:sepmax') | |
| 175 } | |
| 176 } | |
| 177 } else { | |
| 178 stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax') | |
| 179 } | |
| 180 | |
| 181 } else if (grepl('^-speak=',each.arg)) { #-speak=<sep.peak> , user-defined cross-correlation peak strandshift | |
| 182 | |
| 183 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 184 if (! is.na(arg.split[2]) ) { | |
| 185 sep.peak <- arg.split[2] # second part is <sep.peak> | |
| 186 if (is.na(as.numeric(sep.peak))) { # check that sep.peak is numeric | |
| 187 stop('-speak=<sep.peak>: User defined peak shift must be numeric') | |
| 188 } | |
| 189 sep.peak <- as.numeric(sep.peak) | |
| 190 } else { | |
| 191 stop('User defined peak shift must be provided as -speak=<sep.peak>') | |
| 192 } | |
| 193 | |
| 194 } else if (grepl('^-x=',each.arg)) { #-x=<exclude.min>:<exclude.max> | |
| 195 | |
| 196 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 197 if (! is.na(arg.split[2]) ) { | |
| 198 exclude.vals <- arg.split[2] # second part is excludemin:excludemax | |
| 199 exclude.vals.split <- strsplit(exclude.vals,':',fixed=TRUE)[[1]] # split on : | |
| 200 if (length(exclude.vals.split) != 2) { # must have 2 parts | |
| 201 stop('Exclusion limits must be specified as -x=excludemin:excludemax') | |
| 202 } else { | |
| 203 if (any(is.na(as.numeric(exclude.vals.split)))) { # check that exclude vals are numeric | |
| 204 stop('Exclusion limits must be numeric values') | |
| 205 } | |
| 206 exclude.min <- round(as.numeric(exclude.vals.split[1])) | |
| 207 exclude.max <- round(as.numeric(exclude.vals.split[2])) | |
| 208 if (exclude.min > exclude.max) { | |
| 209 stop('Illegal exclusion limits -x=excludemin:excludemax') | |
| 210 } | |
| 211 } | |
| 212 } else { | |
| 213 stop('Exclusion limits must be specified as -x=excludemin:excludemax') | |
| 214 } | |
| 215 | |
| 216 } else if (grepl('^-p=',each.arg)) { #-p=<n.nodes> , number of parallel processing nodes, default=NULL | |
| 217 | |
| 218 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 219 if (! is.na(arg.split[2]) ) { | |
| 220 n.nodes <- arg.split[2] # second part is numnodes | |
| 221 if (is.na(as.numeric(n.nodes))) { # check that n.nodes is numeric | |
| 222 stop('-p=<numnodes>: numnodes must be numeric') | |
| 223 } | |
| 224 n.nodes <- round(as.numeric(n.nodes)) | |
| 225 } else { | |
| 226 stop('Number of parallel nodes must be provided as -p=<numnodes>') | |
| 227 } | |
| 228 | |
| 229 } else if (grepl('^-fdr=',each.arg)) { #-fdr=<fdr> , false discovery rate, default=0.01 | |
| 230 | |
| 231 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 232 if (! is.na(arg.split[2]) ) { | |
| 233 fdr <- arg.split[2] # second part is fdr | |
| 234 if (is.na(as.numeric(fdr))) { # check that fdr is numeric | |
| 235 stop('-fdr=<falseDiscoveryRate>: false discovery rate must be numeric') | |
| 236 } | |
| 237 fdr <- as.numeric(fdr) | |
| 238 } else { | |
| 239 stop('False discovery rate must be provided as -fdr=<fdr>') | |
| 240 } | |
| 241 | |
| 242 } else if (grepl('^-npeak=',each.arg)) { #-npeak=<numPeaks> , number of peaks threshold, default=NA | |
| 243 | |
| 244 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 245 if (! is.na(arg.split[2]) ) { | |
| 246 npeak <- arg.split[2] # second part is npeak | |
| 247 if (is.na(as.numeric(npeak))) { # check that npeak is numeric | |
| 248 stop('-npeak=<numPeaks>: threshold on number of peaks must be numeric') | |
| 249 } | |
| 250 npeak <- round(as.numeric(npeak)) | |
| 251 } else { | |
| 252 stop('Threshold on number of peaks must be provided as -npeak=<numPeaks>') | |
| 253 } | |
| 254 | |
| 255 } else if (grepl('^-tmpdir=',each.arg)) { #-tmpdir=<temp.dir> | |
| 256 | |
| 257 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 258 if (! is.na(arg.split[2]) ) { | |
| 259 temp.dir <- arg.split[2] # second part is temp.dir | |
| 260 } else { | |
| 261 stop('No temporary directory provided for parameter -tmpdir=') | |
| 262 } | |
| 263 | |
| 264 } else if (grepl('^-filtchr=',each.arg)) { #-filtchr=<chrname.rm.pattern> | |
| 265 | |
| 266 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 267 if (! is.na(arg.split[2]) ) { | |
| 268 chrname.rm.pattern <- arg.split[2] # second part is chrname.rm.pattern | |
| 269 } else { | |
| 270 stop('No pattern provided for parameter -filtchr=') | |
| 271 } | |
| 272 | |
| 273 } else if (grepl('^-odir=',each.arg)) { #-odir=<output.odir> | |
| 274 | |
| 275 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 276 if (! is.na(arg.split[2]) ) { | |
| 277 output.odir <- arg.split[2] # second part is output.odir | |
| 278 } else { | |
| 279 stop('No output directory provided for parameter -odir=') | |
| 280 } | |
| 281 | |
| 282 } else if (grepl('^-savn',each.arg)) { # -savn=<output.npeak.file> OR -savn , save narrowpeak | |
| 283 | |
| 284 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 285 if (! is.na(arg.split[2])) { | |
| 286 output.npeak.file <- arg.split[2] #-savn= | |
| 287 } else if (each.arg=='-savn') { | |
| 288 output.npeak.file <- NULL # NULL indicates get the name from the main file name | |
| 289 } else { | |
| 290 stop('Argument for saving narrowPeak file must be -savn or -savn=<filename>') | |
| 291 } | |
| 292 | |
| 293 } else if (grepl('^-savr',each.arg)) { # -savr=<output.rpeak.file> OR -savr , save regionpeak | |
| 294 | |
| 295 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 296 if (! is.na(arg.split[2])) { | |
| 297 output.rpeak.file <- arg.split[2] #-savr= | |
| 298 } else if (each.arg=='-savr') { | |
| 299 output.rpeak.file <- NULL # NULL indicates get the name from the main file name | |
| 300 } else { | |
| 301 stop('Argument for saving regionPeak file must be -savr or -savr=<filename>') | |
| 302 } | |
| 303 | |
| 304 } else if (grepl('^-savd',each.arg)) { # -savd=<output.rdata.file> OR -savd , save Rdata file | |
| 305 | |
| 306 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 307 if (! is.na(arg.split[2])) { | |
| 308 output.rdata.file <- arg.split[2] #-savd= | |
| 309 } else if (each.arg=='-savd') { | |
| 310 output.rdata.file <- NULL # NULL indicates get the name from the main file name | |
| 311 } else { | |
| 312 stop('Argument for saving Rdata file must be -savd or -savd=<filename>') | |
| 313 } | |
| 314 | |
| 315 } else if (grepl('^-savp',each.arg)) { # -savp=<output.plot.file> OR -savp , save cross-correlation plot | |
| 316 | |
| 317 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 318 if (! is.na(arg.split[2])) { | |
| 319 output.plot.file <- arg.split[2] #-savp= | |
| 320 } else if (each.arg=='-savp') { | |
| 321 output.plot.file <- NULL # NULL indicates get the name from the main file name | |
| 322 } else { | |
| 323 stop('Argument for saving Rdata file must be -savp or -savp=<filename>') | |
| 324 } | |
| 325 | |
| 326 } else if (grepl('^-out=',each.arg)) { #-out=<output.result.file> | |
| 327 | |
| 328 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = | |
| 329 if (! is.na(arg.split[2]) ) { | |
| 330 output.result.file <- arg.split[2] # second part is output.result.file | |
| 331 } else { | |
| 332 stop('No result file provided for parameter -out=') | |
| 333 } | |
| 334 | |
| 335 } else if (each.arg == '-rf') { | |
| 336 | |
| 337 replace.flag <- TRUE | |
| 338 | |
| 339 } else if (each.arg == '-clean') { | |
| 340 | |
| 341 clean.files.flag <- TRUE | |
| 342 | |
| 343 } else { | |
| 344 | |
| 345 stop('Illegal argument ',each.arg) | |
| 346 } | |
| 347 } | |
| 348 # End: for loop | |
| 349 | |
| 350 # Check mandatory arguments | |
| 351 if (is.na(chip.file)) { | |
| 352 stop('-c=<tagAlign/BAMFileName> is a mandatory argument') | |
| 353 } | |
| 354 | |
| 355 if (is.na(control.file) && ! is.na(output.npeak.file)) { | |
| 356 stop('-i=<tagAlign/BAMFileName> is required for peak calling') | |
| 357 } | |
| 358 | |
| 359 # Check if ChIP and control files are URLs | |
| 360 if (grepl('^http://',chip.file)) { | |
| 361 isurl.chip.file <- TRUE | |
| 362 } | |
| 363 if (grepl('^http://',control.file)) { | |
| 364 isurl.control.file <- TRUE | |
| 365 } | |
| 366 | |
| 367 # If ChIP file is a URL output.odir MUST be specified | |
| 368 if (isurl.chip.file && is.na(output.odir)) { | |
| 369 stop('If ChIP file is a URL, then output directory MUST be specified') | |
| 370 } | |
| 371 | |
| 372 # Check that ChIP and control files exist | |
| 373 if (isurl.chip.file) { | |
| 374 if (system(paste('wget -q --spider',chip.file)) != 0) { | |
| 375 stop('ChIP file URL not valid: ',chip.file) | |
| 376 } | |
| 377 } else if (!file.exists(chip.file)) { | |
| 378 stop('ChIP File:',chip.file,' does not exist') | |
| 379 } | |
| 380 | |
| 381 if (!is.na(control.file)) { | |
| 382 if (isurl.control.file) { | |
| 383 if (system(paste('wget -q --spider',control.file)) != 0) { | |
| 384 stop('Control file URL not valid: ',control.file) | |
| 385 } | |
| 386 } else if (!file.exists(control.file)) { | |
| 387 stop('Control File:',control.file,' does not exist') | |
| 388 } | |
| 389 } | |
| 390 | |
| 391 # Correct other arguments | |
| 392 if (is.na(output.odir)) { # Reconstruct output.odir if not provided | |
| 393 output.odir <- get.file.parts(chip.file)$path | |
| 394 } | |
| 395 | |
| 396 if (is.null(output.npeak.file)) { # Reconstruct output.npeak.file if NULL | |
| 397 output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep="")) | |
| 398 } | |
| 399 | |
| 400 if (is.null(output.rpeak.file)) { # Reconstruct output.rpeak.file if NULL | |
| 401 output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep="")) | |
| 402 } | |
| 403 | |
| 404 if (is.null(output.rdata.file)) { # Reconstruct output.rdata.file if NULL | |
| 405 output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep="")) | |
| 406 } | |
| 407 | |
| 408 if (is.null(output.plot.file)) { # Reconstruct output.plot.file if NULL | |
| 409 output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep="")) | |
| 410 } | |
| 411 | |
| 412 return(list(chip.file=chip.file, | |
| 413 isurl.chip.file=isurl.chip.file, | |
| 414 control.file=control.file, | |
| 415 isurl.control.file=isurl.control.file, | |
| 416 sep.range=c(sep.min,sep.bin,sep.max), | |
| 417 sep.peak=sep.peak, | |
| 418 ex.range=c(exclude.min,exclude.max), | |
| 419 n.nodes=n.nodes, | |
| 420 fdr=fdr, | |
| 421 npeak=npeak, | |
| 422 temp.dir=temp.dir, | |
| 423 chrname.rm.pattern=chrname.rm.pattern, | |
| 424 output.odir=output.odir, | |
| 425 output.npeak.file=output.npeak.file, | |
| 426 output.rpeak.file=output.rpeak.file, | |
| 427 output.rdata.file=output.rdata.file, | |
| 428 output.plot.file=output.plot.file, | |
| 429 output.result.file=output.result.file, | |
| 430 replace.flag=replace.flag, | |
| 431 clean.files.flag=clean.files.flag)) | |
| 432 } # end: parse.arguments() | |
| 433 | |
| 434 read.align <- function(align.filename) { | |
| 435 # =================================== | |
| 436 # Function will read a tagAlign or BAM file | |
| 437 # =================================== | |
| 438 if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file | |
| 439 chip.data <- read.tagalign.tags(align.filename) | |
| 440 # get readlength info | |
| 441 tmpDataRows <- read.table(align.filename,nrows=500) | |
| 442 chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2)) | |
| 443 } else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file | |
| 444 # create BAM file name | |
| 445 bam2align.filename <- sub('\\.bam','.tagAlign',align.filename) | |
| 446 # generate command to convert bam to tagalign | |
| 447 command <- vector(length=2) | |
| 448 command[1] <- sprintf("samtools view -F 0x0204 -o - %s",align.filename) | |
| 449 command[2] <- paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="") | |
| 450 # command[2] <- paste("awk 'BEGIN{OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="") | |
| 451 command <- paste(command,collapse=" | ") | |
| 452 # Run command | |
| 453 status <- system(command,intern=FALSE,ignore.stderr=FALSE) | |
| 454 if ((status != 0) || !file.exists(bam2align.filename)) { | |
| 455 cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr()) | |
| 456 q(save="no",status=1) | |
| 457 } | |
| 458 # read converted BAM file | |
| 459 chip.data <- read.tagalign.tags(bam2align.filename) | |
| 460 # get readlength info | |
| 461 tmpDataRows <- read.table(bam2align.filename,nrows=500) | |
| 462 chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2)) | |
| 463 # delete temporary tagalign file | |
| 464 file.remove(bam2align.filename) | |
| 465 } else { | |
| 466 cat(sprintf("Error:Unknown file format for file:%s\n",align.fname),file=stderr()) | |
| 467 q(save="no",status=1) | |
| 468 } | |
| 469 return(chip.data) | |
| 470 } # end: read.align() | |
| 471 | |
| 472 print.run.params <- function(params){ | |
| 473 # =================================== | |
| 474 # Output run parameters | |
| 475 # =================================== | |
| 476 cat('################\n',file=stdout()) | |
| 477 cat(iparams$chip.file, | |
| 478 iparams$control.file, | |
| 479 iparams$sep.range, | |
| 480 iparams$sep.peak, | |
| 481 iparams$ex.range, | |
| 482 iparams$n.nodes, | |
| 483 iparams$fdr, | |
| 484 iparams$npeak, | |
| 485 iparams$output.odir, | |
| 486 iparams$output.npeak.file, | |
| 487 iparams$output.rpeak.file, | |
| 488 iparams$output.rdata.file, | |
| 489 iparams$output.plot.file, | |
| 490 iparams$output.result.file, | |
| 491 iparams$replace.flag, | |
| 492 labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift', | |
| 493 'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:', | |
| 494 'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:', | |
| 495 'plot pdf filename:','result filename:','Overwrite files?:'), | |
| 496 fill=18, | |
| 497 file=stdout()) | |
| 498 cat('\n',file=stdout()) | |
| 499 } # end: print.run.parameters() | |
| 500 | |
| 501 check.replace.flag <- function(params){ | |
| 502 # =================================== | |
| 503 # Check if files exist | |
| 504 # =================================== | |
| 505 # If replace.flag is NOT set, check if output files exist and abort if necessary | |
| 506 if (! iparams$replace.flag) { | |
| 507 if (! is.na(iparams$output.npeak.file)) { | |
| 508 if (file.exists(iparams$output.npeak.file)) { | |
| 509 cat('narrowPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr()) | |
| 510 q(save="no",status=1) | |
| 511 } | |
| 512 } | |
| 513 if (! is.na(iparams$output.rpeak.file)) { | |
| 514 if (file.exists(iparams$output.rpeak.file)) { | |
| 515 cat('regionPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr()) | |
| 516 q(save="no",status=1) | |
| 517 } | |
| 518 } | |
| 519 if (! is.na(iparams$output.plot.file)) { | |
| 520 if (file.exists(iparams$output.plot.file)) { | |
| 521 cat('Plot file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr()) | |
| 522 q(save="no",status=1) | |
| 523 } | |
| 524 } | |
| 525 if (! is.na(iparams$output.rdata.file)) { | |
| 526 if (file.exists(iparams$output.rdata.file)) { | |
| 527 cat('Rdata file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr()) | |
| 528 q(save="no",status=1) | |
| 529 } | |
| 530 } | |
| 531 } | |
| 532 } | |
| 533 | |
| 534 # ############################################################################# | |
| 535 # MAIN FUNCTION | |
| 536 # ############################################################################# | |
| 537 | |
| 538 # Check number of arguments | |
| 539 minargs = 1; | |
| 540 maxargs = 17; | |
| 541 if (nargs < minargs | nargs > maxargs) { | |
| 542 print.usage() | |
| 543 q(save="no",status=1) | |
| 544 } | |
| 545 | |
| 546 # Parse arguments | |
| 547 # iparams$chip.file | |
| 548 # iparams$isurl.chip.file | |
| 549 # iparams$control.file | |
| 550 # iparams$isurl.control.file | |
| 551 # iparams$sep.range | |
| 552 # iparams$sep.peak | |
| 553 # iparams$ex.range | |
| 554 # iparams$n.nodes | |
| 555 # iparams$fdr | |
| 556 # iparams$npeak | |
| 557 # iparams$temp.dir | |
| 558 # iparams$output.odir | |
| 559 # iparams$output.npeak.file | |
| 560 # iparams$output.rpeak.file | |
| 561 # iparams$output.rdata.file | |
| 562 # iparams$output.plot.file | |
| 563 # iparams$output.result.file | |
| 564 # iparams$replace.flag | |
| 565 # iparams$clean.files.flag | |
| 566 iparams <- parse.arguments(args) | |
| 567 | |
| 568 # Print run parameters | |
| 569 print.run.params(iparams) | |
| 570 | |
| 571 # Check if output files exist | |
| 572 check.replace.flag(iparams) | |
| 573 | |
| 574 # curr.chip.file and curr.control.file always point to the original ChIP and control files on disk | |
| 575 # ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align | |
| 576 | |
| 577 # Download ChIP and control files if necessary to temp.dir | |
| 578 if (iparams$isurl.chip.file) { | |
| 579 curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix | |
| 580 cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout()) | |
| 581 if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$chip.file)) != 0) { | |
| 582 stop('Error downloading ChIP file:',iparams$chip.file) | |
| 583 } | |
| 584 } else { | |
| 585 curr.chip.file <- iparams$chip.file # file is in original directory | |
| 586 } | |
| 587 | |
| 588 if (iparams$isurl.control.file) { | |
| 589 curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix | |
| 590 cat('Downloading control file:',iparams$control.file,"\n",file=stdout()) | |
| 591 if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$control.file)) != 0) { | |
| 592 stop('Error downloading Control file:',iparams$control.file) | |
| 593 } | |
| 594 } else { | |
| 595 curr.control.file <- iparams$control.file # file is in original directory | |
| 596 } | |
| 597 | |
| 598 # unzip ChIP and input files if required AND copy to temp directory | |
| 599 if (get.file.parts(curr.chip.file)$ext == '.gz') { | |
| 600 ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix] | |
| 601 cat('Decompressing ChIP file\n',file=stdout()) | |
| 602 if (system(paste("gunzip -c",curr.chip.file,">",ta.chip.filename)) != 0) { | |
| 603 stop('Unable to decompress file:', iparams$chip.file) | |
| 604 } | |
| 605 if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set | |
| 606 file.remove(curr.chip.file) | |
| 607 } | |
| 608 } else { | |
| 609 ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$fullname, tmpdir=iparams$temp.dir) | |
| 610 if (iparams$clean.files.flag) { | |
| 611 file.rename(curr.chip.file,ta.chip.filename) # move file to temp.dir/[filename][randsuffix] | |
| 612 } else { | |
| 613 file.copy(curr.chip.file,ta.chip.filename) # copy file to temp.dir/[filename][randsuffix] | |
| 614 } | |
| 615 } | |
| 616 | |
| 617 if (! is.na(iparams$control.file)) { | |
| 618 if (get.file.parts(curr.control.file)$ext == '.gz') { | |
| 619 ta.control.filename <- tempfile(get.file.parts(curr.control.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix] | |
| 620 cat('Decompressing control file\n',file=stdout()) | |
| 621 if (system(paste("gunzip -c",curr.control.file,">",ta.control.filename)) != 0) { | |
| 622 stop('Unable to decompress file:', iparams$control.file) | |
| 623 } | |
| 624 if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set | |
| 625 file.remove(curr.control.file) | |
| 626 } | |
| 627 } else { | |
| 628 ta.control.filename <- tempfile(get.file.parts(curr.control.file)$fullname, tmpdir=iparams$temp.dir) # copy file to temp.dir/[filename][randsuffix] | |
| 629 | |
| 630 if (iparams$clean.files.flag) { | |
| 631 file.rename(curr.control.file,ta.control.filename) # move file to temp.dir/[filename][randsuffix] | |
| 632 } else { | |
| 633 file.copy(curr.control.file,ta.control.filename) # copy file to temp.dir/[filename][randsuffix] | |
| 634 } | |
| 635 } | |
| 636 } | |
| 637 | |
| 638 # Remove downloaded files | |
| 639 if (iparams$isurl.chip.file & file.exists(curr.chip.file)) { | |
| 640 file.remove(curr.chip.file) | |
| 641 } | |
| 642 | |
| 643 if (! is.na(iparams$control.file)) { | |
| 644 if (iparams$isurl.control.file & file.exists(curr.control.file)) { | |
| 645 file.remove(curr.control.file) | |
| 646 } | |
| 647 } | |
| 648 | |
| 649 # Load SPP library | |
| 650 library(spp) | |
| 651 | |
| 652 # Read ChIP tagAlign/BAM files | |
| 653 cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout()) | |
| 654 chip.data <- read.align(ta.chip.filename) | |
| 655 cat("ChIP data read length",chip.data$read.length,"\n",file=stdout()) | |
| 656 file.remove(ta.chip.filename) # Delete temporary file | |
| 657 if (length(chip.data$tags)==0) { | |
| 658 stop('Error in ChIP file format:', iparams$chip.file) | |
| 659 } | |
| 660 # Remove illegal chromosome names | |
| 661 if (! is.na(iparams$chrname.rm.pattern)) { | |
| 662 selectidx <- which(grepl(iparams$chrname.rm.pattern,names(chip.data$tags))==FALSE) | |
| 663 chip.data$tags <- chip.data$tags[selectidx] | |
| 664 chip.data$quality <- chip.data$quality[selectidx] | |
| 665 } | |
| 666 chip.data$num.tags <- sum(unlist(lapply(chip.data$tags,function(d) length(d)))) | |
| 667 | |
| 668 # Read Control tagAlign/BAM files | |
| 669 if (! is.na(iparams$control.file)) { | |
| 670 cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout()) | |
| 671 control.data <- read.align(ta.control.filename) | |
| 672 file.remove(ta.control.filename) # Delete temporary file | |
| 673 if (length(control.data$tags)==0) { | |
| 674 stop('Error in control file format:', iparams$chip.file) | |
| 675 } | |
| 676 cat("Control data read length",control.data$read.length,"\n",file=stdout()) | |
| 677 # Remove illegal chromosome names | |
| 678 if (! is.na(iparams$chrname.rm.pattern)) { | |
| 679 selectidx <- which(grepl(iparams$chrname.rm.pattern,names(control.data$tags))==FALSE) | |
| 680 control.data$tags <- control.data$tags[selectidx] | |
| 681 control.data$quality <- control.data$quality[selectidx] | |
| 682 } | |
| 683 control.data$num.tags <- sum(unlist(lapply(control.data$tags,function(d) length(d)))) | |
| 684 } | |
| 685 | |
| 686 # Open multiple processes if required | |
| 687 if (is.na(iparams$n.nodes)) { | |
| 688 cluster.nodes <- NULL | |
| 689 } else { | |
| 690 library(snow) | |
| 691 cluster.nodes <- makeCluster(iparams$n.nodes) | |
| 692 } | |
| 693 | |
| 694 # ################################# | |
| 695 # Calculate cross-correlation for various strand shifts | |
| 696 # ################################# | |
| 697 cat("Calculating peak characteristics\n",file=stdout()) | |
| 698 # crosscorr | |
| 699 # $cross.correlation : Cross-correlation profile as an $x/$y data.frame | |
| 700 # $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak. | |
| 701 # $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak) | |
| 702 crosscorr <- get.binding.characteristics(chip.data, | |
| 703 srange=iparams$sep.range[c(1,3)], | |
| 704 bin=iparams$sep.range[2], | |
| 705 accept.all.tags=T, | |
| 706 cluster=cluster.nodes) | |
| 707 if (!is.na(iparams$n.nodes)) { | |
| 708 stopCluster(cluster.nodes) | |
| 709 } | |
| 710 | |
| 711 # Smooth the cross-correlation curve if required | |
| 712 cc <- crosscorr$cross.correlation | |
| 713 crosscorr$min.cc <- crosscorr$cross.correlation[ which.min(crosscorr$cross.correlation$y) , ] # minimum value and shift of cross-correlation | |
| 714 cat("Minimum cross-correlation value", crosscorr$min.cc$y,"\n",file=stdout()) | |
| 715 cat("Minimum cross-correlation shift", crosscorr$min.cc$x,"\n",file=stdout()) | |
| 716 sbw <- 2*floor(ceiling(5/iparams$sep.range[2]) / 2) + 1 # smoothing bandwidth | |
| 717 cc$y <- runmean(cc$y,sbw,alg="fast") | |
| 718 | |
| 719 # Compute cross-correlation peak | |
| 720 bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks | |
| 721 peakidx <- (diff(cc$y,bw)>=0) # cc[i] > cc[i-bw] | |
| 722 peakidx <- diff(peakidx,bw) | |
| 723 peakidx <- which(peakidx==-1) + bw | |
| 724 | |
| 725 # exclude peaks from the excluded region | |
| 726 if ( is.nan(iparams$ex.range[2]) ) { | |
| 727 iparams$ex.range[2] <- chip.data$read.length+10 | |
| 728 } | |
| 729 peakidx <- peakidx[(cc$x[peakidx] < iparams$ex.range[1]) | (cc$x[peakidx] > iparams$ex.range[2])] | |
| 730 cc <- cc[peakidx,] | |
| 731 | |
| 732 # Find max peak position and other peaks within 0.9*max_peakvalue that are further away from maxpeakposition | |
| 733 maxpeakidx <- which.max(cc$y) | |
| 734 maxpeakshift <- cc$x[maxpeakidx] | |
| 735 maxpeakval <- cc$y[maxpeakidx] | |
| 736 peakidx <-which((cc$y >= 0.9*maxpeakval) & (cc$x >= maxpeakshift)) | |
| 737 cc <- cc[peakidx,] | |
| 738 | |
| 739 # sort the peaks and get the top 3 | |
| 740 sortidx <- order(cc$y,decreasing=TRUE) | |
| 741 sortidx <- sortidx[c(1:min(3,length(sortidx)))] | |
| 742 cc.peak <- cc[sortidx,] | |
| 743 | |
| 744 # Override peak shift if user supplies peak shift | |
| 745 if (! is.na(iparams$sep.peak)) { | |
| 746 cc.peak <- approx(crosscorr$cross.correlation$x,crosscorr$cross.correlation$y,iparams$sep.peak,rule=2) | |
| 747 } | |
| 748 cat("Peak cross-correlation value", paste(cc.peak$y,collapse=","),"\n",file=stdout()) | |
| 749 cat("Peak strand shift",paste(cc.peak$x,collapse=","),"\n",file=stdout()) | |
| 750 | |
| 751 # Reset values in crosscorr | |
| 752 crosscorr$peak$x <- cc.peak$x[1] | |
| 753 crosscorr$peak$y <- cc.peak$y[1] | |
| 754 | |
| 755 # Compute window half size | |
| 756 whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y)/3 | |
| 757 crosscorr$whs <- max(crosscorr$cross.correlation$x[crosscorr$cross.correlation$y >= whs.thresh]) | |
| 758 cat("Window half size",crosscorr$whs,"\n",file=stdout()) | |
| 759 | |
| 760 # Compute phantom peak coefficient | |
| 761 ph.peakidx <- which( ( crosscorr$cross.correlation$x >= ( chip.data$read.length - round(2*iparams$sep.range[2]) ) ) & | |
| 762 ( crosscorr$cross.correlation$x <= ( chip.data$read.length + round(1.5*iparams$sep.range[2]) ) ) ) | |
| 763 ph.peakidx <- ph.peakidx[ which.max(crosscorr$cross.correlation$y[ph.peakidx]) ] | |
| 764 crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx,] | |
| 765 cat("Phantom peak location",crosscorr$phantom.cc$x,"\n",file=stdout()) | |
| 766 cat("Phantom peak Correlation",crosscorr$phantom.cc$y,"\n",file=stdout()) | |
| 767 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$phantom.cc$y | |
| 768 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y | |
| 769 cat("Normalized cross-correlation coefficient (NCCC)",crosscorr$phantom.coeff,"\n",file=stdout()) | |
| 770 crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y) | |
| 771 cat("Relative Cross correlation Coefficient (RCCC)",crosscorr$rel.phantom.coeff,"\n",file=stdout()) | |
| 772 crosscorr$phantom.quality.tag <- NA | |
| 773 if ( (crosscorr$rel.phantom.coeff >= 0) & (crosscorr$rel.phantom.coeff < 0.25) ) { | |
| 774 crosscorr$phantom.quality.tag <- -2 | |
| 775 } else if ( (crosscorr$rel.phantom.coeff >= 0.25) & (crosscorr$rel.phantom.coeff < 0.5) ) { | |
| 776 crosscorr$phantom.quality.tag <- -1 | |
| 777 } else if ( (crosscorr$rel.phantom.coeff >= 0.5) & (crosscorr$rel.phantom.coeff < 1) ) { | |
| 778 crosscorr$phantom.quality.tag <- 0 | |
| 779 } else if ( (crosscorr$rel.phantom.coeff >= 1) & (crosscorr$rel.phantom.coeff < 1.5) ) { | |
| 780 crosscorr$phantom.quality.tag <- 1 | |
| 781 } else if ( (crosscorr$rel.phantom.coeff >= 1.5) ) { | |
| 782 crosscorr$phantom.quality.tag <- 2 | |
| 783 } | |
| 784 cat("Phantom Peak Quality Tag",crosscorr$phantom.quality.tag,"\n",file=stdout()) | |
| 785 | |
| 786 # Output result to result file if required | |
| 787 #Filename\tnumReads\tPeak_shift\tPeak_Correlation\tRead_length\tPhantomPeak_Correlation\tMin_Correlation_Shift\tMin_Correlation\tNormalized_CrossCorrelation_Coefficient\tRelative_CrossCorrelation_Coefficient\tQualityTag) | |
| 788 if (! is.na(iparams$output.result.file)) { | |
| 789 cat(get.file.parts(iparams$chip.file)$fullname, | |
| 790 chip.data$num.tags, | |
| 791 paste(cc.peak$x,collapse=","), | |
| 792 paste(cc.peak$y,collapse=","), | |
| 793 crosscorr$phantom.cc$x, | |
| 794 crosscorr$phantom.cc$y, | |
| 795 crosscorr$min.cc$x, | |
| 796 crosscorr$min.cc$y, | |
| 797 crosscorr$phantom.coeff, | |
| 798 crosscorr$rel.phantom.coeff, | |
| 799 crosscorr$phantom.quality.tag, | |
| 800 sep="\t", | |
| 801 file=iparams$output.result.file, | |
| 802 append=TRUE) | |
| 803 cat("\n", | |
| 804 file=iparams$output.result.file, | |
| 805 append=TRUE) | |
| 806 } | |
| 807 | |
| 808 # Save figure if required | |
| 809 if (! is.na(iparams$output.plot.file)) { | |
| 810 pdf(file=iparams$output.plot.file,width=5,height=5) | |
| 811 par(mar = c(4,3.5,2,0.5), mgp = c(1.5,0.5,0), cex = 0.8); | |
| 812 plot(crosscorr$cross.correlation, | |
| 813 type='l', | |
| 814 xlab=sprintf("strand-shift (%s)",paste(cc.peak$x,collapse=",")), | |
| 815 ylab="cross-correlation") | |
| 816 abline(v=cc.peak$x,lty=2,col=2) | |
| 817 abline(v=crosscorr$phantom.cc$x,lty=2,col=4) | |
| 818 title(main=get.file.parts(iparams$chip.file)$fullname, | |
| 819 sub=sprintf("NSC=%g,RSC=%g,Qtag=%d",crosscorr$phantom.coeff,crosscorr$rel.phantom.coeff,crosscorr$phantom.quality.tag)) | |
| 820 dev.off(); | |
| 821 } | |
| 822 | |
| 823 # Save RData file if required | |
| 824 if (! is.na(iparams$output.rdata.file)) { | |
| 825 save(iparams, | |
| 826 crosscorr, | |
| 827 cc.peak, | |
| 828 file=iparams$output.rdata.file); | |
| 829 } | |
| 830 | |
| 831 # ################################# | |
| 832 # Call peaks | |
| 833 # ################################# | |
| 834 | |
| 835 if ( !is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file) ) { | |
| 836 | |
| 837 # Remove local tag anomalies | |
| 838 cat('Removing read stacks\n',file=stdout()) | |
| 839 chip.data <- remove.local.tag.anomalies(chip.data$tags) | |
| 840 control.data <- remove.local.tag.anomalies(control.data$tags) | |
| 841 | |
| 842 # Open multiple processes if required | |
| 843 if (is.na(iparams$n.nodes)) { | |
| 844 cluster.nodes <- NULL | |
| 845 } else { | |
| 846 cluster.nodes <- makeCluster(iparams$n.nodes) | |
| 847 } | |
| 848 | |
| 849 # Find peaks | |
| 850 cat('Finding peaks\n',file=stdout()) | |
| 851 if (!is.na(iparams$npeak)) { | |
| 852 iparams$fdr <- 0.96 | |
| 853 } | |
| 854 narrow.peaks <- find.binding.positions(signal.data=chip.data,control.data=control.data,fdr=iparams$fdr,method=tag.lwcc,whs=crosscorr$whs,cluster=cluster.nodes) | |
| 855 if (!is.na(iparams$n.nodes)) { | |
| 856 stopCluster(cluster.nodes) | |
| 857 } | |
| 858 cat(paste("Detected",sum(unlist(lapply(narrow.peaks$npl,function(d) length(d$x)))),"peaks"),"\n",file=stdout()) | |
| 859 | |
| 860 # Write to narrowPeak file | |
| 861 if (!is.na(iparams$output.npeak.file)) { | |
| 862 write.narrowpeak.binding(narrow.peaks,iparams$output.npeak.file,margin=round(crosscorr$whs/2)) | |
| 863 #system(paste('gzip -f ',iparams$output.npeak.file)) | |
| 864 } | |
| 865 | |
| 866 # Compute and write regionPeak file | |
| 867 if (!is.na(iparams$output.rpeak.file)) { | |
| 868 region.peaks <- add.broad.peak.regions(chip.data,control.data,narrow.peaks,window.size=max(50,round(crosscorr$whs/4)),z.thr=10) | |
| 869 write.narrowpeak.binding(region.peaks,iparams$output.rpeak.file,margin=round(crosscorr$whs/2)) | |
| 870 #system(paste('gzip -f ',iparams$output.rpeak.file)) | |
| 871 } | |
| 872 | |
| 873 # Save Rdata file | |
| 874 if (! is.na(iparams$output.rdata.file)) { | |
| 875 save(iparams, | |
| 876 crosscorr, | |
| 877 cc.peak, | |
| 878 narrow.peaks, | |
| 879 region.peaks, | |
| 880 file=iparams$output.rdata.file); | |
| 881 } | |
| 882 | |
| 883 } | |
| 884 | |
| 885 | 
