comparison graphprot_predict_profile_wrapper.pl @ 0:215925e588c4 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 9bb5f3c8ed8e87ec5652b5bc8bf9c774d5534a1a
author rnateam
date Fri, 25 May 2018 12:17:44 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:215925e588c4
1 #!/usr/bin/perl
2
3 use strict;
4 use warnings;
5 use Getopt::Long;
6 use Pod::Usage;
7 use Cwd qw(getcwd abs_path);
8 use List::Util qw(sum);
9
10 =head1 NAME
11
12 =head1 SYNOPSIS
13
14 Galaxy wrapper script for GraphProt (-action predict_profile) to compute
15 the binding profile for a given model on a given set of sequences provided
16 in FASTA format. After profile prediction, average profiles get computed,
17 scores signified and binding peak regions extracted. The score signification
18 is done using the provided fitted GEV parameters, either from .params file
19 or manually set. If score threshold is set (-thr-sc), p-value assignment
20 will be skipped and set score threshold will be used to extract peak
21 regions. NOTE: Additional lines .params file are used to store and get
22 GEV parameters, as well as type of model (model_type: sequence|structure).
23 Also, this wrapper currently works for classification mode only.
24
25 PARAMETERS:
26
27 -help|h display help page
28 -fasta Input FASTA file (option -fasta)
29 -model Input .model file (option -model)
30 -params Input .params file
31 NOTE: uses .params file with additional
32 parameters
33 Manually set parameters (below) will override
34 found settings in .params file
35 -data-id Data ID (option -prefix)
36
37 GraphProt model parameters (by default get from .params file):
38
39 -onlyseq Set if model is a sequence model
40 -R GraphProt model R parameter
41 -D GraphProt model D parameter
42 -epochs GraphProt model epochs parameter
43 -lambda GraphProt model lambda parameter
44 -bitsize GraphProt model bitsize parameter
45 -abstraction GraphProt model RNAshapes abstraction level
46 parameter (set for structure models)
47
48 Peak region extraction parameters:
49
50 -thr-sc Score threshold for extracting peak regions
51 By default p-value of 0.05 is used. If no p-value
52 calculation possible, -thr-sc is used with default: 0
53 -thr-p p-value threshold for extracting peak regions
54 By default, peak regions with p = 0.05 are extracted,
55 as well as p50 score peak regions (if info given)
56 Default: 0.05
57 -merge-dist Maximum merge distance for nearby peak regions
58 Default: report all non-overlapping regions
59 -p50-out Output p50 score filtered peak regions BED file
60 default: false
61
62 GEV distribution parameters:
63
64 -distr-my GEV distribution my parameter for calculating p-values
65 from scores
66 -distr-sigma GEV distrubution sigma parameter for calculating
67 p-values from scores
68 -distr-xi GEV distribution xi parameter for calculating p-values
69 from scores
70 -ap-extlr Used average profile left right extension for
71 averaging scores, which were used for distribution
72 fitting. NOTE: usually a value of 5 was used for
73 for getting GEV distribution and parameters. If you
74 choose a different value here, calculated p-values
75 will be wrong!
76 default : 5
77
78
79 =head1 DISCRIPTION
80
81 5) Write manual
82 6) add output p50 file with NOTE
83
84 6) put GP into rna_tools
85
86 NOTE:
87 Additional lines .params file used to store and get gev parameters, as well
88 as type of model (model_type: sequence|structure).
89
90 Example .params content:
91 epochs: 20
92 lambda: 0.001
93 R: 1
94 D: 4
95 bitsize: 14
96 model_type: sequence
97 #ADDITIONAL MODEL PARAMETERS
98 ap_extlr: 5
99 gev_my: -2.5408
100 gev_sigma: 1.6444
101 gev_xi: -0.1383
102 p50_score: 6.51534
103 p50_p_val: 0.0009059744
104
105 =cut
106
107 ############################
108 # COMMAND LINE CHECKING.
109 ############################
110
111 # Command line argument variables.
112 my ($i_help, $i_fasta, $i_model, $i_params, $i_data_id, $i_thr_sc, $i_thr_p, $i_max_merge_dist, $i_p50_out, $i_gp_r, $i_gp_d, $i_gp_epochs, $i_gp_lambda, $i_gp_bitsize, $i_gp_abstr, $i_gp_onlyseq, $i_distr_my, $i_distr_sigma, $i_distr_xi, $i_ap_extlr);
113
114 # Parse the command line.
115 GetOptions ( "help|h" => \$i_help,
116 "fasta:s" => \$i_fasta,
117 "model:s" => \$i_model,
118 "params:s" => \$i_params,
119 "data-id:s" => \$i_data_id,
120 "thr-sc:f" => \$i_thr_sc,
121 "thr-p:f" => \$i_thr_p,
122 "p50-out" => \$i_p50_out,
123 "merge-dist:i" => \$i_max_merge_dist,
124 "R:i" => \$i_gp_r,
125 "D:i" => \$i_gp_d,
126 "epochs:i" => \$i_gp_epochs,
127 "lambda:f" => \$i_gp_lambda,
128 "bitsize:i" => \$i_gp_bitsize,
129 "abstr:i" => \$i_gp_abstr,
130 "onlyseq" => \$i_gp_onlyseq,
131 "distr-my:f" => \$i_distr_my,
132 "distr-sigma:f" => \$i_distr_sigma,
133 "distr-xi:f" => \$i_distr_xi,
134 "ap-extlr:i" => \$i_ap_extlr ) or pod2usage(1);
135
136 # Check.
137 pod2usage(1) if $i_help;
138 ($i_fasta and $i_model) or pod2usage "ERROR: -fasta, -model are mandatory";
139 if ($i_distr_my or $i_distr_sigma or $i_distr_xi) {
140 ($i_distr_my and $i_distr_sigma and $i_distr_xi) or pod2usage "ERROR: expects all three distribution parameters to be set";
141 }
142
143 #######################
144 # SET PARAMETERS.
145 #######################
146
147 # Prefix.
148 my $data_id = "GraphProt";
149 if ($i_data_id) {
150 $data_id = $i_data_id;
151 }
152 my $thr_sc = 0;
153 if ($i_thr_sc) {
154 $thr_sc = $i_thr_sc;
155 }
156 my $thr_p = 0.05;
157 if ($i_thr_p) {
158 $thr_p = $i_thr_p;
159 }
160 my $ap_extlr = 5;
161 if ($i_ap_extlr) {
162 $ap_extlr = $i_ap_extlr;
163 }
164 my $max_merge_dist = 0;
165 if ($i_max_merge_dist) {
166 $max_merge_dist = $i_max_merge_dist;
167 }
168
169 # Get parameters from .params file.
170 my %params;
171 if ($i_params) {
172 open(IN, $i_params) or die "Cannot open $i_params: $!";
173 while(<IN>) {
174 next if ($_ =~ /^#/);
175 if ($_ =~ /(.+):\s(.+)/) {
176 $params{$1} = $2;
177 }
178 }
179 close IN;
180 }
181
182 # Create GP parameter string.
183 my $params_string = "";
184 # -onlyseq
185 my $model_type = "structure";
186 if (exists $params{"model_type"}) {
187 if ($params{"model_type"} eq "sequence") {
188 $params_string .= " -onlyseq";
189 $model_type = "sequence";
190 }
191 } elsif ($i_gp_onlyseq) {
192 $params_string .= " -onlyseq";
193 $model_type = "sequence";
194 }
195 # -R
196 if ($i_gp_r) {
197 $params_string .= " -R $i_gp_r";
198 } elsif (exists $params{"R"}) {
199 my $v = $params{"R"};
200 $params_string .= " -R $v";
201 } else {
202 die "ERROR: -R needs to be set";
203 }
204 # -D
205 if ($i_gp_d) {
206 $params_string .= " -D $i_gp_d";
207 } elsif (exists $params{"D"}) {
208 my $v = $params{"D"};
209 $params_string .= " -D $v";
210 } else {
211 die "ERROR: -D needs to be set";
212 }
213 # -epochs
214 if ($i_gp_epochs) {
215 $params_string .= " -epochs $i_gp_epochs";
216 } elsif (exists $params{"epochs"}) {
217 my $v = $params{"epochs"};
218 $params_string .= " -epochs $v";
219 } else {
220 die "ERROR: -epochs needs to be set";
221 }
222 # -lambda
223 if ($i_gp_lambda) {
224 $params_string .= " -lambda $i_gp_lambda";
225 } elsif (exists $params{"lambda"}) {
226 my $v = $params{"lambda"};
227 $params_string .= " -lambda $v";
228 } else {
229 die "ERROR: -lambda needs to be set";
230 }
231 # -bitsize
232 if ($i_gp_bitsize) {
233 $params_string .= " -bitsize $i_gp_bitsize";
234 } elsif (exists $params{"bitsize"}) {
235 my $v = $params{"bitsize"};
236 $params_string .= " -bitsize $v";
237 } else {
238 die "ERROR: -bitsize needs to be set";
239 }
240 # -abstraction
241 if ($i_gp_abstr) {
242 $params_string .= " -bitsize $i_gp_abstr";
243 die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure");
244 } elsif (exists $params{"abstraction"}) {
245 my $v = $params{"abstraction"};
246 $params_string .= " -abstraction $v";
247 die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure");
248 }
249
250 # Distribution parameters.
251 my ($distr_my, $distr_sigma, $distr_xi);
252 if ($i_distr_my) {
253 $distr_my = $i_distr_my;
254 } elsif (exists $params{"gev_my"}) {
255 $distr_my = $params{"gev_my"};
256 }
257 if ($i_distr_sigma) {
258 $distr_sigma = $i_distr_sigma;
259 } elsif (exists $params{"gev_sigma"}) {
260 $distr_sigma = $params{"gev_sigma"};
261 }
262 if ($i_distr_xi) {
263 $distr_xi = $i_distr_xi;
264 } elsif (exists $params{"gev_xi"}) {
265 $distr_xi = $params{"gev_xi"};
266 }
267 # Average profile extension parameter.
268 if (exists $params{"ap_extlr"}) {
269 $ap_extlr = $params{"ap_extlr"};
270 }
271
272 print STDOUT "model_type: $model_type\n";
273 print STDOUT "ap_extlr: $ap_extlr\n";
274
275 if ($distr_my and $distr_sigma and $distr_xi) {
276 print STDOUT "distr_my: $distr_my\n";
277 print STDOUT "distr_sigma: $distr_sigma\n";
278 print STDOUT "distr_xi: $distr_xi\n";
279 }
280 print STDOUT "max_merge_dist: $max_merge_dist\n";
281
282 # p50 filter score.
283 my $p50_sc;
284 if (exists $params{"p50_score"}) {
285 $p50_sc = $params{"p50_score"};
286 print STDOUT "p50_score: $p50_sc\n";
287 }
288 # p50 p-value.
289 my $p50_p;
290 if (exists $params{"p50_p_val"}) {
291 $p50_p = $params{"p50_p_val"};
292 print STDOUT "p50_p_val: $p50_p\n";
293 }
294
295
296
297 ##################################
298 # RUN GP PROFILE PREDICTION.
299 ##################################
300
301 # Read in FASTA file headers.
302 my @fasta_ids;
303 my $headers = qx/grep ">" $i_fasta/;
304 while ($headers =~ />(.+?)\n/g) {
305 push(@fasta_ids,$1);
306 }
307
308 # Run GP profile prediction.
309 my $gp_call = "GraphProt.pl -action predict_profile -model $i_model -fasta $i_fasta -prefix $data_id $params_string";
310 print STDOUT "GraphProt call: $gp_call\n";
311 # &> profile_prediction.log
312 qx/$gp_call/;
313
314
315 ####################################
316 # CALCULATE AVERAGE PROFILE.
317 ####################################
318
319 # Calculate .average_profile from GraphProt .profile file.
320 # Also add p-value column if distr parameters set.
321 my $add_p = 0;
322 if ($distr_my and $distr_sigma and $distr_xi) {
323 $add_p = 1;
324 }
325
326 # Average_profile: 1-based, FASTA headers as IDs, using ap_extlr for averaging scores.
327 my $profile = "$data_id.profile";
328 my $average_profile = "$data_id.average_profile";
329 # Calculate window size.
330 my $win = $ap_extlr * 2 + 1;
331
332 open (IN, $profile) or die "Cannot open $profile: $!\n";
333 open (OUT, '>', $average_profile) or die "Cannot open $average_profile: $!";
334
335 # Old ID.
336 my $old_id = "-";
337 # Current ID.
338 my $cur_id = "-";
339 # Start position of the window.
340 my $pos_inc = 0;
341 # Score array.
342 my @scores;
343 # Input row counter.
344 my $c_in = 0;
345 # Output row counter.
346 my $c_out = 0;
347
348 while (<IN>) {
349
350 if ($_ =~ /(.+?)\t\d+?\t(.+?)\n/) {
351
352 $cur_id = $1;
353 my $score = $2;
354 $c_in++;
355
356 # Case: New refseq ID / new paragraph.
357 if ($cur_id ne $old_id) {
358
359 # Print remaining entries at the end of former paragraph.
360 while (scalar(@scores) >= ($ap_extlr + 1)) {
361 # Calculate avg array score.
362 my $mean = array_mean(@scores);
363 $c_out++;
364 $pos_inc++;
365 if ($add_p) {
366 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi));
367 my $p = sprintf("%.8f", (1-$y));
368 print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\t$p\n";
369 } else {
370 print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\n";
371 }
372 # Remove old score from beginning.
373 shift(@scores);
374 }
375
376 # Reset for new paragraph.
377 $pos_inc = 0;
378 @scores = ();
379 $old_id = $cur_id;
380 # Save first score.
381 push(@scores, $score);
382 next;
383
384 }
385
386 # Push score in array for average calculation.
387 push(@scores, $score);
388
389 # Case: array smaller $win.
390 if (scalar(@scores) < $win) {
391 # Subcase: array more than half size, start with calculation.
392 if (scalar(@scores) >= ($ap_extlr + 1)) {
393 # Calculate avg array score.
394 my $mean = array_mean(@scores);
395 $c_out++;
396 $pos_inc++;
397 if ($add_p) {
398 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi));
399 my $p = sprintf("%.8f", (1-$y));
400 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n";
401 } else {
402 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n";
403 }
404 }
405 next;
406 }
407
408 # Case: array has $win size.
409 if (scalar(@scores) == $win) {
410 # Calculate avg array score.
411 my $mean = array_mean(@scores);
412 $c_out++;
413 $pos_inc++;
414 if ($add_p) {
415 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi));
416 my $p = sprintf("%.8f", (1-$y));
417 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n";
418 } else {
419 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n";
420 }
421 # Remove old score from beginning.
422 shift(@scores);
423 next;
424 }
425 }
426 }
427
428 # Print remaining entries at the end of last section.
429 while (scalar(@scores) >= ($ap_extlr + 1)) {
430 # Calculate avg array score.
431 my $mean = array_mean(@scores);
432 $c_out++;
433 $pos_inc++;
434 if ($add_p) {
435 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi));
436 my $p = sprintf("%.8f", (1-$y));
437 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n";
438 } else {
439 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n";
440 }
441 # Remove old score from beginning.
442 shift(@scores);
443 }
444 close IN;
445 close OUT;
446
447 qx/rm -f $profile/;
448
449
450 #########################
451 # GET PEAK REGIONS.
452 #########################
453
454
455 my $p50_peaks_bed_out = $data_id . ".peak_regions_p50.bed";
456 my $peaks_bed_out = $data_id . ".peak_regions.bed";
457
458 # If p-values were calculated, use set p-value to get peak regions.
459 if ($add_p) {
460 extract_peak_regions_from_p_values($average_profile, $peaks_bed_out, $max_merge_dist, $thr_p);
461 # If p50-out set.
462 if ($i_p50_out) {
463 # If p50 p-value present, also get peak regions file for this threshold.
464 if ($p50_p) {
465 extract_peak_regions_from_p_values($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_p);
466 } else {
467 qx/touch $p50_peaks_bed_out/;
468 }
469 }
470 } else {
471 # If no p-values available, use score threshold for defining peak regions.
472 extract_peak_regions_from_scores($average_profile, $peaks_bed_out, $max_merge_dist, $thr_sc);
473 # If p50-out set.
474 if ($i_p50_out) {
475 # If p50 score present, also get peak regions file for this threshold.
476 if ($p50_sc) {
477 extract_peak_regions_from_scores($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_sc);
478 } else {
479 qx/touch $p50_peaks_bed_out/;
480 }
481 }
482 }
483
484 exit;
485
486
487
488 ################################################################################
489 ################################################################################
490 # SUBROUTINES.
491 ################################################################################
492
493
494 sub array_mean {
495
496 my $mean = sum(@_)/@_;
497 return sprintf("%.5f", $mean);
498
499 }
500
501
502 ################################################################################
503
504 sub extract_peak_regions_from_p_values {
505
506 my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $max_p) = @_;
507
508 my $old_ref = "no";
509 my $in = "N";
510 my $ref_id;
511 my $start;
512 my $end;
513 my $best_p = 1;
514 my $best_sc = -1000;
515 my $region_s;
516 my $region_e;
517 my $zero_pos;
518
519 my $temp_bed_file = "peak_regions.tmp.bed";
520
521 open (IN, $average_profile) or die "Cannot open $average_profile: $!";
522 open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!";
523
524 while (<IN>) {
525 chomp;
526 my ($ref, $s, $sc, $p) = (split /\t/)[0,1,2,3];
527 # If file has zero-based positions.
528 if ($s == 0) {
529 $zero_pos = 1;
530 }
531 # If positions are one-based, make them zero-based.
532 unless ($zero_pos) {
533 $s -= 1;
534 }
535 # At transcript ends, if in positive region, write and reset.
536 if ($old_ref ne $ref) {
537 if ($in eq "Y") {
538 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n";
539 $in = "N";
540 }
541 }
542 $old_ref = $ref;
543 # Deal with good p-value regions.
544 if ($p <= $max_p) {
545 # Start of a positive cluster.
546 if ($in eq "N") {
547 $start = $s;
548 $region_s = $s;
549 $region_e = $s + 1;
550 $end = $s + 1;
551 $best_p = $p;
552 $best_sc = $sc;
553 $ref_id = $ref;
554 $in = "Y";
555 next;
556 # Inside a positive cluster.
557 } elsif ($in eq "Y") {
558 if ($p < $best_p) {
559 $start = $s;
560 $end = $s + 1;
561 $best_p = $p;
562 $best_sc = $sc;
563 $ref_id = $ref;
564 }
565 $region_e++;
566 next;
567 }
568 } else {
569 # If we were in positive cluster before.
570 if ($in eq "Y") {
571 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n";
572 $in = "N";
573 }
574 }
575 }
576 # After last line processed.
577 if ($in eq "Y") {
578 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n";
579 $in = "N";
580 }
581 close IN;
582 close OUT;
583 # If merge distance zero (i.e. end of one block is -1 from start of next block).
584 if ($max_merge_dist == 0) {
585 qx/cat $temp_bed_file > $peak_regions_bed_file/;
586 } else {
587 # Merge nearby regions.
588 open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!";
589 open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!";
590 # For storing current block stats.
591 my $block_chr = 0;
592 my ($block_s, $block_e, $block_best_pos, $block_best_p, $block_best_sc);
593 while (<IN>) {
594 chomp;
595 my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3];
596 my ($best_pos, $best_sc, $best_p) = (split /;/, $id);
597 if ($chr eq $block_chr) {
598 # If $block_e, $s within merge merge.
599 if ( ($s - $block_e) <= $max_merge_dist ) {
600 # Update block stats.
601 $block_e = $e;
602 if ($block_best_p > $best_p) {
603 $block_best_p = $best_p;
604 $block_best_sc = $best_sc;
605 $block_best_pos = $best_pos;
606 }
607 } else {
608 # If $e outside merge range, print block.
609 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n";
610 # Store new block.
611 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p);
612 }
613
614 } else {
615 # If new chromosome, print last block, otherwise it is the first block.
616 if ($block_chr) {
617 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n";
618 }
619 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p);
620 }
621 }
622 # Print last block.
623 if ($block_chr) {
624 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n";
625 }
626 close OUT;
627 close IN;
628 }
629 qx/rm -f $temp_bed_file/;
630 }
631
632
633 ################################################################################
634
635 sub extract_peak_regions_from_scores {
636
637 my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $min_sc) = @_;
638
639 my $old_ref = "no";
640 my $in = "N";
641 my $ref_id;
642 my $start;
643 my $end;
644 my $best_sc = -1000;
645 my $region_s;
646 my $region_e;
647 my $zero_pos;
648
649 my $temp_bed_file = "peak_regions.tmp.bed";
650
651 open (IN, $average_profile) or die "Cannot open $average_profile: $!";
652 open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!";
653
654 while (<IN>) {
655 chomp;
656 my ($ref, $s, $sc) = (split /\t/)[0,1,2];
657 # If file has zero-based positions.
658 if ($s == 0) {
659 $zero_pos = 1;
660 }
661 # If positions are one-based, make them zero-based.
662 unless ($zero_pos) {
663 $s -= 1;
664 }
665 # At transcript ends, if in positive region, write and reset.
666 if ($old_ref ne $ref) {
667 if ($in eq "Y") {
668 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n";
669 $in = "N";
670 }
671 }
672 $old_ref = $ref;
673 # Deal with positive regions.
674 if ($sc > $min_sc) {
675 # Start of a positive cluster.
676 if ($in eq "N") {
677 $start = $s;
678 $region_s = $s;
679 $region_e = $s + 1;
680 $end = $s + 1;
681 $best_sc = $sc;
682 $ref_id = $ref;
683 $in = "Y";
684 next;
685 # Inside a positive cluster.
686 } elsif ($in eq "Y") {
687 if ($sc > $best_sc) {
688 $start = $s;
689 $end = $s + 1;
690 $best_sc = $sc;
691 $ref_id = $ref;
692 }
693 $region_e++;
694 next;
695 }
696 } else {
697 # If we were in positive cluster before.
698 if ($in eq "Y") {
699 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n";
700 $in = "N";
701 }
702 }
703 }
704 # After last line processed.
705 if ($in eq "Y") {
706 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n";
707 $in = "N";
708 }
709 close IN;
710 close OUT;
711 # If merge distance zero (i.e. end of one block is -1 from start of next block).
712 if ($max_merge_dist == 0) {
713 qx/cat $temp_bed_file > $peak_regions_bed_file/;
714 } else {
715 # Merge nearby regions.
716 open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!";
717 open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!";
718 # For storing current block stats.
719 my $block_chr = 0;
720 my ($block_s, $block_e, $block_best_pos, $block_best_sc);
721 while (<IN>) {
722 chomp;
723 my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3];
724 my ($best_pos, $best_sc) = (split /;/, $id);
725 if ($chr eq $block_chr) {
726 # If $block_e, $s within merge merge.
727 if ( ($s - $block_e) <= $max_merge_dist ) {
728 # Update block stats.
729 $block_e = $e;
730 if ($block_best_sc < $best_sc) {
731 $block_best_sc = $best_sc;
732 $block_best_pos = $best_pos;
733 }
734 } else {
735 # If $e outside merge range, print block.
736 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n";
737 # Store new block.
738 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc);
739 }
740
741 } else {
742 # If new chromosome, print last block, otherwise it is the first block.
743 if ($block_chr) {
744 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n";
745 }
746 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc);
747 }
748
749 }
750 # Print last block.
751 if ($block_chr) {
752 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n";
753 }
754 close OUT;
755 close IN;
756 }
757 qx/rm -f $temp_bed_file/;
758 }
759
760
761 ################################################################################
762
763
764