0
|
1 # run_spp.R
|
|
2 # =============
|
|
3 # Author: Anshul Kundaje, Computer Science Dept., Stanford University
|
|
4 # Email: akundaje@stanford.edu
|
|
5 # Last updated: Oct 8, 2010
|
|
6 # =============
|
|
7 # MANDATORY ARGUMENTS
|
|
8 # -c=<ChIP_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
|
|
9 # MANDATORY ARGUMENT FOR PEAK CALLING
|
|
10 # -i=<Input_tagAlign/BAMFile>, full path and name of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz)
|
|
11 # OPTIONAL ARGUMENTS
|
|
12 # -s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600
|
|
13 # -speak=<strPeak>, user-defined cross-correlation peak strandshift
|
|
14 # -x=<min>:<max>, strand shifts to exclude (This is mainly to avoid phantom peaks) default=10:(readlen+10)
|
|
15 # -p=<nodes> , number of parallel processing nodes, default=NULL
|
|
16 # -fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling
|
|
17 # -npeak=<numPeaks>, threshold on number of peaks to call
|
|
18 # -tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)
|
|
19 # -filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name
|
|
20 # OUTPUT PARAMETERS
|
|
21 # -odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)
|
|
22 # -savn=<narrowpeakfilename> OR -savn NarrowPeak file name
|
|
23 # -savr=<regionpeakfilename> OR -savr RegionPeak file name
|
|
24 # -savd=<rdatafile> OR -savd , save Rdata file
|
|
25 # -savp=<plotdatafile> OR -savp , save cross-correlation plot
|
|
26 # -out=<resultfile>, append peakshift result to a file
|
|
27 # format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag
|
|
28 # -rf , if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists
|
|
29 # -clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files
|
|
30
|
|
31 args <- commandArgs(trailingOnly=TRUE); # Read Arguments from command line
|
|
32 nargs = length(args); # number of arguments
|
|
33
|
|
34 # ###########################################################################
|
|
35 # AUXILIARY FUNCTIONS
|
|
36 # ###########################################################################
|
|
37
|
|
38 print.usage <- function() {
|
|
39 # ===================================
|
|
40 # Function will print function usage
|
|
41 # ===================================
|
|
42 cat('Usage: Rscript run_spp.R <options>\n',file=stderr())
|
|
43 cat('MANDATORY ARGUMENTS\n',file=stderr())
|
|
44 cat('-c=<ChIP_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped)(FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr())
|
|
45 cat('MANDATORY ARGUMENTS FOR PEAK CALLING\n',file=stderr())
|
|
46 cat('-i=<Input_alignFile>, full path and name (or URL) of tagAlign/BAM file (can be gzipped) (FILE EXTENSION MUST BE tagAlign.gz, tagAlign, bam or bam.gz) \n',file=stderr())
|
|
47 cat('OPTIONAL ARGUMENTS\n',file=stderr())
|
|
48 cat('-s=<min>:<step>:<max> , strand shifts at which cross-correlation is evaluated, default=-100:5:600\n',file=stderr())
|
|
49 cat('-speak=<strPeak>, user-defined cross-correlation peak strandshift\n',file=stderr())
|
|
50 cat('-x=<min>:<max>, strand shifts to exclude (This is mainly to avoid region around phantom peak) default=10:(readlen+10)\n',file=stderr())
|
|
51 cat('-p=<nodes> , number of parallel processing nodes, default=0\n',file=stderr())
|
|
52 cat('-fdr=<falseDisoveryRate> , false discovery rate threshold for peak calling\n',file=stderr())
|
|
53 cat('-npeak=<numPeaks>, threshold on number of peaks to call\n',file=stderr())
|
|
54 cat('-tmpdir=<tempdir> , Temporary directory (if not specified R function tempdir() is used)\n',file=stderr())
|
|
55 cat('-filtchr=<chrnamePattern> , Pattern to use to remove tags that map to specific chromosomes e.g. _ will remove all tags that map to chromosomes with _ in their name\n',file=stderr())
|
|
56 cat('OUTPUT ARGUMENTS\n',file=stderr())
|
|
57 cat('-odir=<outputDirectory> name of output directory (If not set same as ChIP file directory is used)\n',file=stderr())
|
|
58 cat('-savn=<narrowpeakfilename> OR -savn NarrowPeak file name (fixed width peaks)\n',file=stderr())
|
|
59 cat('-savr=<regionpeakfilename> OR -savr RegionPeak file name (variable width peaks with regions of enrichment)\n',file=stderr())
|
|
60 cat('-savd=<rdatafile> OR -savd, save Rdata file\n',file=stderr())
|
|
61 cat('-savp=<plotdatafile> OR -savp, save cross-correlation plot\n',file=stderr())
|
|
62 cat('-out=<resultfile>, append peakshift/phantomPeak results to a file\n',file=stderr())
|
|
63 cat(' format:Filename<tab>numReads<tab>estFragLen<tab>corr_estFragLen<tab>PhantomPeak<tab>corr_phantomPeak<tab>argmin_corr<tab>min_corr<tab>phantomPeakCoef<tab>relPhantomPeakCoef<tab>QualityTag)\n',file=stderr())
|
|
64 cat('-rf, if plot or rdata or narrowPeak file exists replace it. If not used then the run is aborted if the plot or Rdata or narrowPeak file exists\n',file=stderr())
|
|
65 cat('-clean, if present will remove the original chip and control files after reading them in. CAUTION: Use only if the script calling run_spp.R is creating temporary files\n',file=stderr())
|
|
66 } # end: print.usage()
|
|
67
|
|
68 get.file.parts <- function(file.fullpath) {
|
|
69 # ===================================
|
|
70 # Function will take a file name with path and split the file name into
|
|
71 # path, fullname, name and ext
|
|
72 # ===================================
|
|
73 if (! is.character(file.fullpath)) {
|
|
74 stop('File name must be a string')
|
|
75 }
|
|
76
|
|
77 file.parts <- strsplit(as.character(file.fullpath), .Platform$file.sep, fixed=TRUE)[[1]] # split on file separator
|
|
78
|
|
79 if (length(file.parts) == 0) { # if empty file name
|
|
80 return(list(path='',
|
|
81 fullname='',
|
|
82 name='',
|
|
83 ext='')
|
|
84 )
|
|
85 } else {
|
|
86 if (length(file.parts) == 1) { # if no path then just the file name itself
|
|
87 file.path <- '.'
|
|
88 file.fullname <- file.parts
|
|
89 } else {
|
|
90 file.path <- paste(file.parts[1:(length(file.parts)-1)], collapse=.Platform$file.sep) # 1:last-1 token is path
|
|
91 file.fullname <- file.parts[length(file.parts)] # last token is filename
|
|
92 }
|
|
93 file.fullname.parts <- strsplit(file.fullname,'.',fixed=TRUE)[[1]] # split on .
|
|
94 if (length(file.fullname.parts) == 1) { # if no extension
|
|
95 file.ext <- ''
|
|
96 file.name <- file.fullname.parts
|
|
97 } else {
|
|
98 file.ext <- paste('.', file.fullname.parts[length(file.fullname.parts)], sep="") # add the . to the last token
|
|
99 file.name <- paste(file.fullname.parts[1:(length(file.fullname.parts)-1)], collapse=".")
|
|
100 }
|
|
101 return(list(path=file.path,
|
|
102 fullname=file.fullname,
|
|
103 name=file.name,
|
|
104 ext=file.ext))
|
|
105 }
|
|
106 } # end: get.file.parts()
|
|
107
|
|
108 parse.arguments <- function(args) {
|
|
109 # ===================================
|
|
110 # Function will parse arguments
|
|
111 # ===================================
|
|
112 # Set arguments to default values
|
|
113 chip.file <- NA # main ChIP tagAlign/BAM file name
|
|
114 isurl.chip.file <- FALSE # flag indicating whether ChIP file is a URL
|
|
115 control.file <- NA # control tagAlign/BAM file name
|
|
116 isurl.control.file <- FALSE # flag indicating whether control file is a URL
|
|
117 sep.min <- -100 # min strand shift
|
|
118 sep.max <- 600 # max strand shift
|
|
119 sep.bin <- 5 # increment for strand shift
|
|
120 sep.peak <- NA # user-defined peak shift
|
|
121 exclude.min <- 10 # lowerbound of strand shift exclusion region
|
|
122 exclude.max <- NaN # upperbound of strand shift exclusion region
|
|
123 n.nodes <- NA # number of parallel processing nodes
|
|
124 fdr <- 0.01 # false discovery rate threshold for peak calling
|
|
125 npeak <- NA # threshold on number of peaks to call
|
|
126 temp.dir <- tempdir() # temporary directory
|
|
127 chrname.rm.pattern <- NA # chromosome name pattern used to remove tags
|
|
128 output.odir <- NA # Output directory name
|
|
129 output.npeak.file <- NA # Output narrowPeak file name
|
|
130 output.rpeak.file <- NA # Output regionPeak file name
|
|
131 output.rdata.file <- NA # Rdata file
|
|
132 output.plot.file <- NA # cross correlation plot file
|
|
133 output.result.file <- NA # result file
|
|
134 replace.flag <- FALSE # replace file flag
|
|
135 clean.files.flag <- FALSE # file deletion flag
|
|
136
|
|
137 # Parse arguments
|
|
138 for (each.arg in args) {
|
|
139
|
|
140 if (grepl('^-c=',each.arg)) { #-c=<chip.file>
|
|
141
|
|
142 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
143 if (! is.na(arg.split[2]) ) {
|
|
144 chip.file <- arg.split[2] # second part is chip.file
|
|
145 } else {
|
|
146 stop('No tagAlign/BAM file name provided for parameter -c=')
|
|
147 }
|
|
148
|
|
149 } else if (grepl('^-i=',each.arg)) { #-i=<control.file>
|
|
150
|
|
151 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
152 if (! is.na(arg.split[2]) ) {
|
|
153 control.file <- arg.split[2] # second part is control.file
|
|
154 } else {
|
|
155 stop('No tagAlign/BAM file name provided for parameter -i=')
|
|
156 }
|
|
157
|
|
158 } else if (grepl('^-s=',each.arg)) { #-s=<sep.min>:<sep.bin>:<sep.max>
|
|
159
|
|
160 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
161 if (! is.na(arg.split[2]) ) {
|
|
162 sep.vals <- arg.split[2] # second part is sepmin:sepbin:sepmax
|
|
163 sep.vals.split <- strsplit(sep.vals,':',fixed=TRUE)[[1]] # split on :
|
|
164 if (length(sep.vals.split) != 3) { # must have 3 parts
|
|
165 stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')
|
|
166 } else {
|
|
167 if (any(is.na(as.numeric(sep.vals.split)))) { # check that sep vals are numeric
|
|
168 stop('Strand shift limits must be numeric values')
|
|
169 }
|
|
170 sep.min <- round(as.numeric(sep.vals.split[1]))
|
|
171 sep.bin <- round(as.numeric(sep.vals.split[2]))
|
|
172 sep.max <- round(as.numeric(sep.vals.split[3]))
|
|
173 if ((sep.min > sep.max) || (sep.bin > (sep.max - sep.min)) || (sep.bin < 0)) {
|
|
174 stop('Illegal separation values -s=sepmin:sepbin:sepmax')
|
|
175 }
|
|
176 }
|
|
177 } else {
|
|
178 stop('Strand shift limits must be specified as -s=sepmin:sepbin:sepmax')
|
|
179 }
|
|
180
|
|
181 } else if (grepl('^-speak=',each.arg)) { #-speak=<sep.peak> , user-defined cross-correlation peak strandshift
|
|
182
|
|
183 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
184 if (! is.na(arg.split[2]) ) {
|
|
185 sep.peak <- arg.split[2] # second part is <sep.peak>
|
|
186 if (is.na(as.numeric(sep.peak))) { # check that sep.peak is numeric
|
|
187 stop('-speak=<sep.peak>: User defined peak shift must be numeric')
|
|
188 }
|
|
189 sep.peak <- as.numeric(sep.peak)
|
|
190 } else {
|
|
191 stop('User defined peak shift must be provided as -speak=<sep.peak>')
|
|
192 }
|
|
193
|
|
194 } else if (grepl('^-x=',each.arg)) { #-x=<exclude.min>:<exclude.max>
|
|
195
|
|
196 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
197 if (! is.na(arg.split[2]) ) {
|
|
198 exclude.vals <- arg.split[2] # second part is excludemin:excludemax
|
|
199 exclude.vals.split <- strsplit(exclude.vals,':',fixed=TRUE)[[1]] # split on :
|
|
200 if (length(exclude.vals.split) != 2) { # must have 2 parts
|
|
201 stop('Exclusion limits must be specified as -x=excludemin:excludemax')
|
|
202 } else {
|
|
203 if (any(is.na(as.numeric(exclude.vals.split)))) { # check that exclude vals are numeric
|
|
204 stop('Exclusion limits must be numeric values')
|
|
205 }
|
|
206 exclude.min <- round(as.numeric(exclude.vals.split[1]))
|
|
207 exclude.max <- round(as.numeric(exclude.vals.split[2]))
|
|
208 if (exclude.min > exclude.max) {
|
|
209 stop('Illegal exclusion limits -x=excludemin:excludemax')
|
|
210 }
|
|
211 }
|
|
212 } else {
|
|
213 stop('Exclusion limits must be specified as -x=excludemin:excludemax')
|
|
214 }
|
|
215
|
|
216 } else if (grepl('^-p=',each.arg)) { #-p=<n.nodes> , number of parallel processing nodes, default=NULL
|
|
217
|
|
218 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
219 if (! is.na(arg.split[2]) ) {
|
|
220 n.nodes <- arg.split[2] # second part is numnodes
|
|
221 if (is.na(as.numeric(n.nodes))) { # check that n.nodes is numeric
|
|
222 stop('-p=<numnodes>: numnodes must be numeric')
|
|
223 }
|
|
224 n.nodes <- round(as.numeric(n.nodes))
|
|
225 } else {
|
|
226 stop('Number of parallel nodes must be provided as -p=<numnodes>')
|
|
227 }
|
|
228
|
|
229 } else if (grepl('^-fdr=',each.arg)) { #-fdr=<fdr> , false discovery rate, default=0.01
|
|
230
|
|
231 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
232 if (! is.na(arg.split[2]) ) {
|
|
233 fdr <- arg.split[2] # second part is fdr
|
|
234 if (is.na(as.numeric(fdr))) { # check that fdr is numeric
|
|
235 stop('-fdr=<falseDiscoveryRate>: false discovery rate must be numeric')
|
|
236 }
|
|
237 fdr <- as.numeric(fdr)
|
|
238 } else {
|
|
239 stop('False discovery rate must be provided as -fdr=<fdr>')
|
|
240 }
|
|
241
|
|
242 } else if (grepl('^-npeak=',each.arg)) { #-npeak=<numPeaks> , number of peaks threshold, default=NA
|
|
243
|
|
244 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
245 if (! is.na(arg.split[2]) ) {
|
|
246 npeak <- arg.split[2] # second part is npeak
|
|
247 if (is.na(as.numeric(npeak))) { # check that npeak is numeric
|
|
248 stop('-npeak=<numPeaks>: threshold on number of peaks must be numeric')
|
|
249 }
|
|
250 npeak <- round(as.numeric(npeak))
|
|
251 } else {
|
|
252 stop('Threshold on number of peaks must be provided as -npeak=<numPeaks>')
|
|
253 }
|
|
254
|
|
255 } else if (grepl('^-tmpdir=',each.arg)) { #-tmpdir=<temp.dir>
|
|
256
|
|
257 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
258 if (! is.na(arg.split[2]) ) {
|
|
259 temp.dir <- arg.split[2] # second part is temp.dir
|
|
260 } else {
|
|
261 stop('No temporary directory provided for parameter -tmpdir=')
|
|
262 }
|
|
263
|
|
264 } else if (grepl('^-filtchr=',each.arg)) { #-filtchr=<chrname.rm.pattern>
|
|
265
|
|
266 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
267 if (! is.na(arg.split[2]) ) {
|
|
268 chrname.rm.pattern <- arg.split[2] # second part is chrname.rm.pattern
|
|
269 } else {
|
|
270 stop('No pattern provided for parameter -filtchr=')
|
|
271 }
|
|
272
|
|
273 } else if (grepl('^-odir=',each.arg)) { #-odir=<output.odir>
|
|
274
|
|
275 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
276 if (! is.na(arg.split[2]) ) {
|
|
277 output.odir <- arg.split[2] # second part is output.odir
|
|
278 } else {
|
|
279 stop('No output directory provided for parameter -odir=')
|
|
280 }
|
|
281
|
|
282 } else if (grepl('^-savn',each.arg)) { # -savn=<output.npeak.file> OR -savn , save narrowpeak
|
|
283
|
|
284 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
285 if (! is.na(arg.split[2])) {
|
|
286 output.npeak.file <- arg.split[2] #-savn=
|
|
287 } else if (each.arg=='-savn') {
|
|
288 output.npeak.file <- NULL # NULL indicates get the name from the main file name
|
|
289 } else {
|
|
290 stop('Argument for saving narrowPeak file must be -savn or -savn=<filename>')
|
|
291 }
|
|
292
|
|
293 } else if (grepl('^-savr',each.arg)) { # -savr=<output.rpeak.file> OR -savr , save regionpeak
|
|
294
|
|
295 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
296 if (! is.na(arg.split[2])) {
|
|
297 output.rpeak.file <- arg.split[2] #-savr=
|
|
298 } else if (each.arg=='-savr') {
|
|
299 output.rpeak.file <- NULL # NULL indicates get the name from the main file name
|
|
300 } else {
|
|
301 stop('Argument for saving regionPeak file must be -savr or -savr=<filename>')
|
|
302 }
|
|
303
|
|
304 } else if (grepl('^-savd',each.arg)) { # -savd=<output.rdata.file> OR -savd , save Rdata file
|
|
305
|
|
306 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
307 if (! is.na(arg.split[2])) {
|
|
308 output.rdata.file <- arg.split[2] #-savd=
|
|
309 } else if (each.arg=='-savd') {
|
|
310 output.rdata.file <- NULL # NULL indicates get the name from the main file name
|
|
311 } else {
|
|
312 stop('Argument for saving Rdata file must be -savd or -savd=<filename>')
|
|
313 }
|
|
314
|
|
315 } else if (grepl('^-savp',each.arg)) { # -savp=<output.plot.file> OR -savp , save cross-correlation plot
|
|
316
|
|
317 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
318 if (! is.na(arg.split[2])) {
|
|
319 output.plot.file <- arg.split[2] #-savp=
|
|
320 } else if (each.arg=='-savp') {
|
|
321 output.plot.file <- NULL # NULL indicates get the name from the main file name
|
|
322 } else {
|
|
323 stop('Argument for saving Rdata file must be -savp or -savp=<filename>')
|
|
324 }
|
|
325
|
|
326 } else if (grepl('^-out=',each.arg)) { #-out=<output.result.file>
|
|
327
|
|
328 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
|
|
329 if (! is.na(arg.split[2]) ) {
|
|
330 output.result.file <- arg.split[2] # second part is output.result.file
|
|
331 } else {
|
|
332 stop('No result file provided for parameter -out=')
|
|
333 }
|
|
334
|
|
335 } else if (each.arg == '-rf') {
|
|
336
|
|
337 replace.flag <- TRUE
|
|
338
|
|
339 } else if (each.arg == '-clean') {
|
|
340
|
|
341 clean.files.flag <- TRUE
|
|
342
|
|
343 } else {
|
|
344
|
|
345 stop('Illegal argument ',each.arg)
|
|
346 }
|
|
347 }
|
|
348 # End: for loop
|
|
349
|
|
350 # Check mandatory arguments
|
|
351 if (is.na(chip.file)) {
|
|
352 stop('-c=<tagAlign/BAMFileName> is a mandatory argument')
|
|
353 }
|
|
354
|
|
355 if (is.na(control.file) && ! is.na(output.npeak.file)) {
|
|
356 stop('-i=<tagAlign/BAMFileName> is required for peak calling')
|
|
357 }
|
|
358
|
|
359 # Check if ChIP and control files are URLs
|
|
360 if (grepl('^http://',chip.file)) {
|
|
361 isurl.chip.file <- TRUE
|
|
362 }
|
|
363 if (grepl('^http://',control.file)) {
|
|
364 isurl.control.file <- TRUE
|
|
365 }
|
|
366
|
|
367 # If ChIP file is a URL output.odir MUST be specified
|
|
368 if (isurl.chip.file && is.na(output.odir)) {
|
|
369 stop('If ChIP file is a URL, then output directory MUST be specified')
|
|
370 }
|
|
371
|
|
372 # Check that ChIP and control files exist
|
|
373 if (isurl.chip.file) {
|
|
374 if (system(paste('wget -q --spider',chip.file)) != 0) {
|
|
375 stop('ChIP file URL not valid: ',chip.file)
|
|
376 }
|
|
377 } else if (!file.exists(chip.file)) {
|
|
378 stop('ChIP File:',chip.file,' does not exist')
|
|
379 }
|
|
380
|
|
381 if (!is.na(control.file)) {
|
|
382 if (isurl.control.file) {
|
|
383 if (system(paste('wget -q --spider',control.file)) != 0) {
|
|
384 stop('Control file URL not valid: ',control.file)
|
|
385 }
|
|
386 } else if (!file.exists(control.file)) {
|
|
387 stop('Control File:',control.file,' does not exist')
|
|
388 }
|
|
389 }
|
|
390
|
|
391 # Correct other arguments
|
|
392 if (is.na(output.odir)) { # Reconstruct output.odir if not provided
|
|
393 output.odir <- get.file.parts(chip.file)$path
|
|
394 }
|
|
395
|
|
396 if (is.null(output.npeak.file)) { # Reconstruct output.npeak.file if NULL
|
|
397 output.npeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.narrowPeak', sep=""))
|
|
398 }
|
|
399
|
|
400 if (is.null(output.rpeak.file)) { # Reconstruct output.rpeak.file if NULL
|
|
401 output.rpeak.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '_VS_', get.file.parts(control.file)$name,'.regionPeak', sep=""))
|
|
402 }
|
|
403
|
|
404 if (is.null(output.rdata.file)) { # Reconstruct output.rdata.file if NULL
|
|
405 output.rdata.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.Rdata', sep=""))
|
|
406 }
|
|
407
|
|
408 if (is.null(output.plot.file)) { # Reconstruct output.plot.file if NULL
|
|
409 output.plot.file <- file.path(output.odir, paste(get.file.parts(chip.file)$name, '.pdf', sep=""))
|
|
410 }
|
|
411
|
|
412 return(list(chip.file=chip.file,
|
|
413 isurl.chip.file=isurl.chip.file,
|
|
414 control.file=control.file,
|
|
415 isurl.control.file=isurl.control.file,
|
|
416 sep.range=c(sep.min,sep.bin,sep.max),
|
|
417 sep.peak=sep.peak,
|
|
418 ex.range=c(exclude.min,exclude.max),
|
|
419 n.nodes=n.nodes,
|
|
420 fdr=fdr,
|
|
421 npeak=npeak,
|
|
422 temp.dir=temp.dir,
|
|
423 chrname.rm.pattern=chrname.rm.pattern,
|
|
424 output.odir=output.odir,
|
|
425 output.npeak.file=output.npeak.file,
|
|
426 output.rpeak.file=output.rpeak.file,
|
|
427 output.rdata.file=output.rdata.file,
|
|
428 output.plot.file=output.plot.file,
|
|
429 output.result.file=output.result.file,
|
|
430 replace.flag=replace.flag,
|
|
431 clean.files.flag=clean.files.flag))
|
|
432 } # end: parse.arguments()
|
|
433
|
|
434 read.align <- function(align.filename) {
|
|
435 # ===================================
|
|
436 # Function will read a tagAlign or BAM file
|
|
437 # ===================================
|
|
438 if (grepl('(\\.bam)?.*(\\.tagAlign)',align.filename)) { # if tagalign file
|
|
439 chip.data <- read.tagalign.tags(align.filename)
|
|
440 # get readlength info
|
|
441 tmpDataRows <- read.table(align.filename,nrows=500)
|
|
442 chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
|
|
443 } else if (grepl('(\\.tagAlign)?.*(\\.bam)',align.filename)) { # if bam file
|
|
444 # create BAM file name
|
|
445 bam2align.filename <- sub('\\.bam','.tagAlign',align.filename)
|
|
446 # generate command to convert bam to tagalign
|
|
447 command <- vector(length=2)
|
|
448 command[1] <- sprintf("samtools view -F 0x0204 -o - %s",align.filename)
|
|
449 command[2] <- paste("awk 'BEGIN{FS=" , '"\t"' , ";OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="")
|
|
450 # command[2] <- paste("awk 'BEGIN{OFS=", '"\t"} {if (and($2,16) > 0) {print $3,($4-1),($4-1+length($10)),"N","1000","-"} else {print $3,($4-1),($4-1+length($10)),"N","1000","+"}}', "' 1> ", bam2align.filename, sep="")
|
|
451 command <- paste(command,collapse=" | ")
|
|
452 # Run command
|
|
453 status <- system(command,intern=FALSE,ignore.stderr=FALSE)
|
|
454 if ((status != 0) || !file.exists(bam2align.filename)) {
|
|
455 cat(sprintf("Error converting BAM to tagalign file: %s\n",align.filename),file=stderr())
|
|
456 q(save="no",status=1)
|
|
457 }
|
|
458 # read converted BAM file
|
|
459 chip.data <- read.tagalign.tags(bam2align.filename)
|
|
460 # get readlength info
|
|
461 tmpDataRows <- read.table(bam2align.filename,nrows=500)
|
|
462 chip.data$read.length <- round(median(tmpDataRows$V3 - tmpDataRows$V2))
|
|
463 # delete temporary tagalign file
|
|
464 file.remove(bam2align.filename)
|
|
465 } else {
|
|
466 cat(sprintf("Error:Unknown file format for file:%s\n",align.fname),file=stderr())
|
|
467 q(save="no",status=1)
|
|
468 }
|
|
469 return(chip.data)
|
|
470 } # end: read.align()
|
|
471
|
|
472 print.run.params <- function(params){
|
|
473 # ===================================
|
|
474 # Output run parameters
|
|
475 # ===================================
|
|
476 cat('################\n',file=stdout())
|
|
477 cat(iparams$chip.file,
|
|
478 iparams$control.file,
|
|
479 iparams$sep.range,
|
|
480 iparams$sep.peak,
|
|
481 iparams$ex.range,
|
|
482 iparams$n.nodes,
|
|
483 iparams$fdr,
|
|
484 iparams$npeak,
|
|
485 iparams$output.odir,
|
|
486 iparams$output.npeak.file,
|
|
487 iparams$output.rpeak.file,
|
|
488 iparams$output.rdata.file,
|
|
489 iparams$output.plot.file,
|
|
490 iparams$output.result.file,
|
|
491 iparams$replace.flag,
|
|
492 labels=c('ChIP data:','Control data:', 'strandshift(min):','strandshift(step):','strandshift(max)','user-defined peak shift',
|
|
493 'exclusion(min):','exclusion(max):','num parallel nodes:','FDR threshold:','NumPeaks Threshold:','Output Directory:',
|
|
494 'narrowPeak output file name:', 'regionPeak output file name:', 'Rdata filename:',
|
|
495 'plot pdf filename:','result filename:','Overwrite files?:'),
|
|
496 fill=18,
|
|
497 file=stdout())
|
|
498 cat('\n',file=stdout())
|
|
499 } # end: print.run.parameters()
|
|
500
|
|
501 check.replace.flag <- function(params){
|
|
502 # ===================================
|
|
503 # Check if files exist
|
|
504 # ===================================
|
|
505 # If replace.flag is NOT set, check if output files exist and abort if necessary
|
|
506 if (! iparams$replace.flag) {
|
|
507 if (! is.na(iparams$output.npeak.file)) {
|
|
508 if (file.exists(iparams$output.npeak.file)) {
|
|
509 cat('narrowPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
|
|
510 q(save="no",status=1)
|
|
511 }
|
|
512 }
|
|
513 if (! is.na(iparams$output.rpeak.file)) {
|
|
514 if (file.exists(iparams$output.rpeak.file)) {
|
|
515 cat('regionPeak file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
|
|
516 q(save="no",status=1)
|
|
517 }
|
|
518 }
|
|
519 if (! is.na(iparams$output.plot.file)) {
|
|
520 if (file.exists(iparams$output.plot.file)) {
|
|
521 cat('Plot file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
|
|
522 q(save="no",status=1)
|
|
523 }
|
|
524 }
|
|
525 if (! is.na(iparams$output.rdata.file)) {
|
|
526 if (file.exists(iparams$output.rdata.file)) {
|
|
527 cat('Rdata file already exists. Aborting Run. Use -rf if you want to overwrite\n',file=stderr())
|
|
528 q(save="no",status=1)
|
|
529 }
|
|
530 }
|
|
531 }
|
|
532 }
|
|
533
|
|
534 # #############################################################################
|
|
535 # MAIN FUNCTION
|
|
536 # #############################################################################
|
|
537
|
|
538 # Check number of arguments
|
|
539 minargs = 1;
|
|
540 maxargs = 17;
|
|
541 if (nargs < minargs | nargs > maxargs) {
|
|
542 print.usage()
|
|
543 q(save="no",status=1)
|
|
544 }
|
|
545
|
|
546 # Parse arguments
|
|
547 # iparams$chip.file
|
|
548 # iparams$isurl.chip.file
|
|
549 # iparams$control.file
|
|
550 # iparams$isurl.control.file
|
|
551 # iparams$sep.range
|
|
552 # iparams$sep.peak
|
|
553 # iparams$ex.range
|
|
554 # iparams$n.nodes
|
|
555 # iparams$fdr
|
|
556 # iparams$npeak
|
|
557 # iparams$temp.dir
|
|
558 # iparams$output.odir
|
|
559 # iparams$output.npeak.file
|
|
560 # iparams$output.rpeak.file
|
|
561 # iparams$output.rdata.file
|
|
562 # iparams$output.plot.file
|
|
563 # iparams$output.result.file
|
|
564 # iparams$replace.flag
|
|
565 # iparams$clean.files.flag
|
|
566 iparams <- parse.arguments(args)
|
|
567
|
|
568 # Print run parameters
|
|
569 print.run.params(iparams)
|
|
570
|
|
571 # Check if output files exist
|
|
572 check.replace.flag(iparams)
|
|
573
|
|
574 # curr.chip.file and curr.control.file always point to the original ChIP and control files on disk
|
|
575 # ta.chip.filename & ta.control.filename always point to the final but temporary versions of the ChIP and control files that will be passed to read.align
|
|
576
|
|
577 # Download ChIP and control files if necessary to temp.dir
|
|
578 if (iparams$isurl.chip.file) {
|
|
579 curr.chip.file <- file.path(iparams$temp.dir, get.file.parts(iparams$chip.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
|
|
580 cat('Downloading ChIP file:',iparams$chip.file,"\n",file=stdout())
|
|
581 if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$chip.file)) != 0) {
|
|
582 stop('Error downloading ChIP file:',iparams$chip.file)
|
|
583 }
|
|
584 } else {
|
|
585 curr.chip.file <- iparams$chip.file # file is in original directory
|
|
586 }
|
|
587
|
|
588 if (iparams$isurl.control.file) {
|
|
589 curr.control.file <- file.path(iparams$temp.dir, get.file.parts(iparams$control.file)$fullname) # file is downloaded to temp.dir. Has same name as URL suffix
|
|
590 cat('Downloading control file:',iparams$control.file,"\n",file=stdout())
|
|
591 if (system(paste('wget -N -q -P',iparams$temp.dir,iparams$control.file)) != 0) {
|
|
592 stop('Error downloading Control file:',iparams$control.file)
|
|
593 }
|
|
594 } else {
|
|
595 curr.control.file <- iparams$control.file # file is in original directory
|
|
596 }
|
|
597
|
|
598 # unzip ChIP and input files if required AND copy to temp directory
|
|
599 if (get.file.parts(curr.chip.file)$ext == '.gz') {
|
|
600 ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
|
|
601 cat('Decompressing ChIP file\n',file=stdout())
|
|
602 if (system(paste("gunzip -c",curr.chip.file,">",ta.chip.filename)) != 0) {
|
|
603 stop('Unable to decompress file:', iparams$chip.file)
|
|
604 }
|
|
605 if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
|
|
606 file.remove(curr.chip.file)
|
|
607 }
|
|
608 } else {
|
|
609 ta.chip.filename <- tempfile(get.file.parts(curr.chip.file)$fullname, tmpdir=iparams$temp.dir)
|
|
610 if (iparams$clean.files.flag) {
|
|
611 file.rename(curr.chip.file,ta.chip.filename) # move file to temp.dir/[filename][randsuffix]
|
|
612 } else {
|
|
613 file.copy(curr.chip.file,ta.chip.filename) # copy file to temp.dir/[filename][randsuffix]
|
|
614 }
|
|
615 }
|
|
616
|
|
617 if (! is.na(iparams$control.file)) {
|
|
618 if (get.file.parts(curr.control.file)$ext == '.gz') {
|
|
619 ta.control.filename <- tempfile(get.file.parts(curr.control.file)$name, tmpdir=iparams$temp.dir) # unzip file to temp.dir/[filename with .gz removed][randsuffix]
|
|
620 cat('Decompressing control file\n',file=stdout())
|
|
621 if (system(paste("gunzip -c",curr.control.file,">",ta.control.filename)) != 0) {
|
|
622 stop('Unable to decompress file:', iparams$control.file)
|
|
623 }
|
|
624 if (iparams$clean.files.flag) { # Remove original file if clean.files.flag is set
|
|
625 file.remove(curr.control.file)
|
|
626 }
|
|
627 } else {
|
|
628 ta.control.filename <- tempfile(get.file.parts(curr.control.file)$fullname, tmpdir=iparams$temp.dir) # copy file to temp.dir/[filename][randsuffix]
|
|
629
|
|
630 if (iparams$clean.files.flag) {
|
|
631 file.rename(curr.control.file,ta.control.filename) # move file to temp.dir/[filename][randsuffix]
|
|
632 } else {
|
|
633 file.copy(curr.control.file,ta.control.filename) # copy file to temp.dir/[filename][randsuffix]
|
|
634 }
|
|
635 }
|
|
636 }
|
|
637
|
|
638 # Remove downloaded files
|
|
639 if (iparams$isurl.chip.file & file.exists(curr.chip.file)) {
|
|
640 file.remove(curr.chip.file)
|
|
641 }
|
|
642
|
|
643 if (! is.na(iparams$control.file)) {
|
|
644 if (iparams$isurl.control.file & file.exists(curr.control.file)) {
|
|
645 file.remove(curr.control.file)
|
|
646 }
|
|
647 }
|
|
648
|
|
649 # Load SPP library
|
|
650 library(spp)
|
|
651
|
|
652 # Read ChIP tagAlign/BAM files
|
|
653 cat("Reading ChIP tagAlign/BAM file",iparams$chip.file,"\n",file=stdout())
|
|
654 chip.data <- read.align(ta.chip.filename)
|
|
655 cat("ChIP data read length",chip.data$read.length,"\n",file=stdout())
|
|
656 file.remove(ta.chip.filename) # Delete temporary file
|
|
657 if (length(chip.data$tags)==0) {
|
|
658 stop('Error in ChIP file format:', iparams$chip.file)
|
|
659 }
|
|
660 # Remove illegal chromosome names
|
|
661 if (! is.na(iparams$chrname.rm.pattern)) {
|
|
662 selectidx <- which(grepl(iparams$chrname.rm.pattern,names(chip.data$tags))==FALSE)
|
|
663 chip.data$tags <- chip.data$tags[selectidx]
|
|
664 chip.data$quality <- chip.data$quality[selectidx]
|
|
665 }
|
|
666 chip.data$num.tags <- sum(unlist(lapply(chip.data$tags,function(d) length(d))))
|
|
667
|
|
668 # Read Control tagAlign/BAM files
|
|
669 if (! is.na(iparams$control.file)) {
|
|
670 cat("Reading Control tagAlign/BAM file",iparams$control.file,"\n",file=stdout())
|
|
671 control.data <- read.align(ta.control.filename)
|
|
672 file.remove(ta.control.filename) # Delete temporary file
|
|
673 if (length(control.data$tags)==0) {
|
|
674 stop('Error in control file format:', iparams$chip.file)
|
|
675 }
|
|
676 cat("Control data read length",control.data$read.length,"\n",file=stdout())
|
|
677 # Remove illegal chromosome names
|
|
678 if (! is.na(iparams$chrname.rm.pattern)) {
|
|
679 selectidx <- which(grepl(iparams$chrname.rm.pattern,names(control.data$tags))==FALSE)
|
|
680 control.data$tags <- control.data$tags[selectidx]
|
|
681 control.data$quality <- control.data$quality[selectidx]
|
|
682 }
|
|
683 control.data$num.tags <- sum(unlist(lapply(control.data$tags,function(d) length(d))))
|
|
684 }
|
|
685
|
|
686 # Open multiple processes if required
|
|
687 if (is.na(iparams$n.nodes)) {
|
|
688 cluster.nodes <- NULL
|
|
689 } else {
|
|
690 library(snow)
|
|
691 cluster.nodes <- makeCluster(iparams$n.nodes)
|
|
692 }
|
|
693
|
|
694 # #################################
|
|
695 # Calculate cross-correlation for various strand shifts
|
|
696 # #################################
|
|
697 cat("Calculating peak characteristics\n",file=stdout())
|
|
698 # crosscorr
|
|
699 # $cross.correlation : Cross-correlation profile as an $x/$y data.frame
|
|
700 # $peak : Position ($x) and height ($y) of automatically detected cross-correlation peak.
|
|
701 # $whs: Optimized window half-size for binding detection (based on the width of the cross-correlation peak)
|
|
702 crosscorr <- get.binding.characteristics(chip.data,
|
|
703 srange=iparams$sep.range[c(1,3)],
|
|
704 bin=iparams$sep.range[2],
|
|
705 accept.all.tags=T,
|
|
706 cluster=cluster.nodes)
|
|
707 if (!is.na(iparams$n.nodes)) {
|
|
708 stopCluster(cluster.nodes)
|
|
709 }
|
|
710
|
|
711 # Smooth the cross-correlation curve if required
|
|
712 cc <- crosscorr$cross.correlation
|
|
713 crosscorr$min.cc <- crosscorr$cross.correlation[ which.min(crosscorr$cross.correlation$y) , ] # minimum value and shift of cross-correlation
|
|
714 cat("Minimum cross-correlation value", crosscorr$min.cc$y,"\n",file=stdout())
|
|
715 cat("Minimum cross-correlation shift", crosscorr$min.cc$x,"\n",file=stdout())
|
|
716 sbw <- 2*floor(ceiling(5/iparams$sep.range[2]) / 2) + 1 # smoothing bandwidth
|
|
717 cc$y <- runmean(cc$y,sbw,alg="fast")
|
|
718
|
|
719 # Compute cross-correlation peak
|
|
720 bw <- ceiling(2/iparams$sep.range[2]) # crosscorr[i] is compared to crosscorr[i+/-bw] to find peaks
|
|
721 peakidx <- (diff(cc$y,bw)>=0) # cc[i] > cc[i-bw]
|
|
722 peakidx <- diff(peakidx,bw)
|
|
723 peakidx <- which(peakidx==-1) + bw
|
|
724
|
|
725 # exclude peaks from the excluded region
|
|
726 if ( is.nan(iparams$ex.range[2]) ) {
|
|
727 iparams$ex.range[2] <- chip.data$read.length+10
|
|
728 }
|
|
729 peakidx <- peakidx[(cc$x[peakidx] < iparams$ex.range[1]) | (cc$x[peakidx] > iparams$ex.range[2])]
|
|
730 cc <- cc[peakidx,]
|
|
731
|
|
732 # Find max peak position and other peaks within 0.9*max_peakvalue that are further away from maxpeakposition
|
|
733 maxpeakidx <- which.max(cc$y)
|
|
734 maxpeakshift <- cc$x[maxpeakidx]
|
|
735 maxpeakval <- cc$y[maxpeakidx]
|
|
736 peakidx <-which((cc$y >= 0.9*maxpeakval) & (cc$x >= maxpeakshift))
|
|
737 cc <- cc[peakidx,]
|
|
738
|
|
739 # sort the peaks and get the top 3
|
|
740 sortidx <- order(cc$y,decreasing=TRUE)
|
|
741 sortidx <- sortidx[c(1:min(3,length(sortidx)))]
|
|
742 cc.peak <- cc[sortidx,]
|
|
743
|
|
744 # Override peak shift if user supplies peak shift
|
|
745 if (! is.na(iparams$sep.peak)) {
|
|
746 cc.peak <- approx(crosscorr$cross.correlation$x,crosscorr$cross.correlation$y,iparams$sep.peak,rule=2)
|
|
747 }
|
|
748 cat("Peak cross-correlation value", paste(cc.peak$y,collapse=","),"\n",file=stdout())
|
|
749 cat("Peak strand shift",paste(cc.peak$x,collapse=","),"\n",file=stdout())
|
|
750
|
|
751 # Reset values in crosscorr
|
|
752 crosscorr$peak$x <- cc.peak$x[1]
|
|
753 crosscorr$peak$y <- cc.peak$y[1]
|
|
754
|
|
755 # Compute window half size
|
|
756 whs.thresh <- crosscorr$min.cc$y + (crosscorr$peak$y - crosscorr$min.cc$y)/3
|
|
757 crosscorr$whs <- max(crosscorr$cross.correlation$x[crosscorr$cross.correlation$y >= whs.thresh])
|
|
758 cat("Window half size",crosscorr$whs,"\n",file=stdout())
|
|
759
|
|
760 # Compute phantom peak coefficient
|
|
761 ph.peakidx <- which( ( crosscorr$cross.correlation$x >= ( chip.data$read.length - round(2*iparams$sep.range[2]) ) ) &
|
|
762 ( crosscorr$cross.correlation$x <= ( chip.data$read.length + round(1.5*iparams$sep.range[2]) ) ) )
|
|
763 ph.peakidx <- ph.peakidx[ which.max(crosscorr$cross.correlation$y[ph.peakidx]) ]
|
|
764 crosscorr$phantom.cc <- crosscorr$cross.correlation[ph.peakidx,]
|
|
765 cat("Phantom peak location",crosscorr$phantom.cc$x,"\n",file=stdout())
|
|
766 cat("Phantom peak Correlation",crosscorr$phantom.cc$y,"\n",file=stdout())
|
|
767 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$phantom.cc$y
|
|
768 crosscorr$phantom.coeff <- crosscorr$peak$y / crosscorr$min.cc$y
|
|
769 cat("Normalized cross-correlation coefficient (NCCC)",crosscorr$phantom.coeff,"\n",file=stdout())
|
|
770 crosscorr$rel.phantom.coeff <- (crosscorr$peak$y - crosscorr$min.cc$y) / (crosscorr$phantom.cc$y - crosscorr$min.cc$y)
|
|
771 cat("Relative Cross correlation Coefficient (RCCC)",crosscorr$rel.phantom.coeff,"\n",file=stdout())
|
|
772 crosscorr$phantom.quality.tag <- NA
|
|
773 if ( (crosscorr$rel.phantom.coeff >= 0) & (crosscorr$rel.phantom.coeff < 0.25) ) {
|
|
774 crosscorr$phantom.quality.tag <- -2
|
|
775 } else if ( (crosscorr$rel.phantom.coeff >= 0.25) & (crosscorr$rel.phantom.coeff < 0.5) ) {
|
|
776 crosscorr$phantom.quality.tag <- -1
|
|
777 } else if ( (crosscorr$rel.phantom.coeff >= 0.5) & (crosscorr$rel.phantom.coeff < 1) ) {
|
|
778 crosscorr$phantom.quality.tag <- 0
|
|
779 } else if ( (crosscorr$rel.phantom.coeff >= 1) & (crosscorr$rel.phantom.coeff < 1.5) ) {
|
|
780 crosscorr$phantom.quality.tag <- 1
|
|
781 } else if ( (crosscorr$rel.phantom.coeff >= 1.5) ) {
|
|
782 crosscorr$phantom.quality.tag <- 2
|
|
783 }
|
|
784 cat("Phantom Peak Quality Tag",crosscorr$phantom.quality.tag,"\n",file=stdout())
|
|
785
|
|
786 # Output result to result file if required
|
|
787 #Filename\tnumReads\tPeak_shift\tPeak_Correlation\tRead_length\tPhantomPeak_Correlation\tMin_Correlation_Shift\tMin_Correlation\tNormalized_CrossCorrelation_Coefficient\tRelative_CrossCorrelation_Coefficient\tQualityTag)
|
|
788 if (! is.na(iparams$output.result.file)) {
|
|
789 cat(get.file.parts(iparams$chip.file)$fullname,
|
|
790 chip.data$num.tags,
|
|
791 paste(cc.peak$x,collapse=","),
|
|
792 paste(cc.peak$y,collapse=","),
|
|
793 crosscorr$phantom.cc$x,
|
|
794 crosscorr$phantom.cc$y,
|
|
795 crosscorr$min.cc$x,
|
|
796 crosscorr$min.cc$y,
|
|
797 crosscorr$phantom.coeff,
|
|
798 crosscorr$rel.phantom.coeff,
|
|
799 crosscorr$phantom.quality.tag,
|
|
800 sep="\t",
|
|
801 file=iparams$output.result.file,
|
|
802 append=TRUE)
|
|
803 cat("\n",
|
|
804 file=iparams$output.result.file,
|
|
805 append=TRUE)
|
|
806 }
|
|
807
|
|
808 # Save figure if required
|
|
809 if (! is.na(iparams$output.plot.file)) {
|
|
810 pdf(file=iparams$output.plot.file,width=5,height=5)
|
|
811 par(mar = c(4,3.5,2,0.5), mgp = c(1.5,0.5,0), cex = 0.8);
|
|
812 plot(crosscorr$cross.correlation,
|
|
813 type='l',
|
|
814 xlab=sprintf("strand-shift (%s)",paste(cc.peak$x,collapse=",")),
|
|
815 ylab="cross-correlation")
|
|
816 abline(v=cc.peak$x,lty=2,col=2)
|
|
817 abline(v=crosscorr$phantom.cc$x,lty=2,col=4)
|
|
818 title(main=get.file.parts(iparams$chip.file)$fullname,
|
|
819 sub=sprintf("NSC=%g,RSC=%g,Qtag=%d",crosscorr$phantom.coeff,crosscorr$rel.phantom.coeff,crosscorr$phantom.quality.tag))
|
|
820 dev.off();
|
|
821 }
|
|
822
|
|
823 # Save RData file if required
|
|
824 if (! is.na(iparams$output.rdata.file)) {
|
|
825 save(iparams,
|
|
826 crosscorr,
|
|
827 cc.peak,
|
|
828 file=iparams$output.rdata.file);
|
|
829 }
|
|
830
|
|
831 # #################################
|
|
832 # Call peaks
|
|
833 # #################################
|
|
834
|
|
835 if ( !is.na(iparams$output.npeak.file) || !is.na(iparams$output.rpeak.file) ) {
|
|
836
|
|
837 # Remove local tag anomalies
|
|
838 cat('Removing read stacks\n',file=stdout())
|
|
839 chip.data <- remove.local.tag.anomalies(chip.data$tags)
|
|
840 control.data <- remove.local.tag.anomalies(control.data$tags)
|
|
841
|
|
842 # Open multiple processes if required
|
|
843 if (is.na(iparams$n.nodes)) {
|
|
844 cluster.nodes <- NULL
|
|
845 } else {
|
|
846 cluster.nodes <- makeCluster(iparams$n.nodes)
|
|
847 }
|
|
848
|
|
849 # Find peaks
|
|
850 cat('Finding peaks\n',file=stdout())
|
|
851 if (!is.na(iparams$npeak)) {
|
|
852 iparams$fdr <- 0.96
|
|
853 }
|
|
854 narrow.peaks <- find.binding.positions(signal.data=chip.data,control.data=control.data,fdr=iparams$fdr,method=tag.lwcc,whs=crosscorr$whs,cluster=cluster.nodes)
|
|
855 if (!is.na(iparams$n.nodes)) {
|
|
856 stopCluster(cluster.nodes)
|
|
857 }
|
|
858 cat(paste("Detected",sum(unlist(lapply(narrow.peaks$npl,function(d) length(d$x)))),"peaks"),"\n",file=stdout())
|
|
859
|
|
860 # Write to narrowPeak file
|
|
861 if (!is.na(iparams$output.npeak.file)) {
|
|
862 write.narrowpeak.binding(narrow.peaks,iparams$output.npeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
|
|
863 system(paste('gzip -f ',iparams$output.npeak.file))
|
|
864 }
|
|
865
|
|
866 # Compute and write regionPeak file
|
|
867 if (!is.na(iparams$output.rpeak.file)) {
|
|
868 region.peaks <- add.broad.peak.regions(chip.data,control.data,narrow.peaks,window.size=max(50,round(crosscorr$whs/4)),z.thr=10)
|
|
869 write.narrowpeak.binding(region.peaks,iparams$output.rpeak.file,margin=round(crosscorr$whs/2),npeaks=iparams$npeak)
|
|
870 system(paste('gzip -f ',iparams$output.rpeak.file))
|
|
871 }
|
|
872
|
|
873 # Save Rdata file
|
|
874 if (! is.na(iparams$output.rdata.file)) {
|
|
875 save(iparams,
|
|
876 crosscorr,
|
|
877 cc.peak,
|
|
878 narrow.peaks,
|
|
879 region.peaks,
|
|
880 file=iparams$output.rdata.file);
|
|
881 }
|
|
882
|
|
883 }
|
|
884
|
|
885
|