Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_predict_profile_wrapper.pl @ 0:215925e588c4 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 9bb5f3c8ed8e87ec5652b5bc8bf9c774d5534a1a
author | rnateam |
---|---|
date | Fri, 25 May 2018 12:17:44 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:215925e588c4 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Getopt::Long; | |
6 use Pod::Usage; | |
7 use Cwd qw(getcwd abs_path); | |
8 use List::Util qw(sum); | |
9 | |
10 =head1 NAME | |
11 | |
12 =head1 SYNOPSIS | |
13 | |
14 Galaxy wrapper script for GraphProt (-action predict_profile) to compute | |
15 the binding profile for a given model on a given set of sequences provided | |
16 in FASTA format. After profile prediction, average profiles get computed, | |
17 scores signified and binding peak regions extracted. The score signification | |
18 is done using the provided fitted GEV parameters, either from .params file | |
19 or manually set. If score threshold is set (-thr-sc), p-value assignment | |
20 will be skipped and set score threshold will be used to extract peak | |
21 regions. NOTE: Additional lines .params file are used to store and get | |
22 GEV parameters, as well as type of model (model_type: sequence|structure). | |
23 Also, this wrapper currently works for classification mode only. | |
24 | |
25 PARAMETERS: | |
26 | |
27 -help|h display help page | |
28 -fasta Input FASTA file (option -fasta) | |
29 -model Input .model file (option -model) | |
30 -params Input .params file | |
31 NOTE: uses .params file with additional | |
32 parameters | |
33 Manually set parameters (below) will override | |
34 found settings in .params file | |
35 -data-id Data ID (option -prefix) | |
36 | |
37 GraphProt model parameters (by default get from .params file): | |
38 | |
39 -onlyseq Set if model is a sequence model | |
40 -R GraphProt model R parameter | |
41 -D GraphProt model D parameter | |
42 -epochs GraphProt model epochs parameter | |
43 -lambda GraphProt model lambda parameter | |
44 -bitsize GraphProt model bitsize parameter | |
45 -abstraction GraphProt model RNAshapes abstraction level | |
46 parameter (set for structure models) | |
47 | |
48 Peak region extraction parameters: | |
49 | |
50 -thr-sc Score threshold for extracting peak regions | |
51 By default p-value of 0.05 is used. If no p-value | |
52 calculation possible, -thr-sc is used with default: 0 | |
53 -thr-p p-value threshold for extracting peak regions | |
54 By default, peak regions with p = 0.05 are extracted, | |
55 as well as p50 score peak regions (if info given) | |
56 Default: 0.05 | |
57 -merge-dist Maximum merge distance for nearby peak regions | |
58 Default: report all non-overlapping regions | |
59 -p50-out Output p50 score filtered peak regions BED file | |
60 default: false | |
61 | |
62 GEV distribution parameters: | |
63 | |
64 -distr-my GEV distribution my parameter for calculating p-values | |
65 from scores | |
66 -distr-sigma GEV distrubution sigma parameter for calculating | |
67 p-values from scores | |
68 -distr-xi GEV distribution xi parameter for calculating p-values | |
69 from scores | |
70 -ap-extlr Used average profile left right extension for | |
71 averaging scores, which were used for distribution | |
72 fitting. NOTE: usually a value of 5 was used for | |
73 for getting GEV distribution and parameters. If you | |
74 choose a different value here, calculated p-values | |
75 will be wrong! | |
76 default : 5 | |
77 | |
78 | |
79 =head1 DISCRIPTION | |
80 | |
81 5) Write manual | |
82 6) add output p50 file with NOTE | |
83 | |
84 6) put GP into rna_tools | |
85 | |
86 NOTE: | |
87 Additional lines .params file used to store and get gev parameters, as well | |
88 as type of model (model_type: sequence|structure). | |
89 | |
90 Example .params content: | |
91 epochs: 20 | |
92 lambda: 0.001 | |
93 R: 1 | |
94 D: 4 | |
95 bitsize: 14 | |
96 model_type: sequence | |
97 #ADDITIONAL MODEL PARAMETERS | |
98 ap_extlr: 5 | |
99 gev_my: -2.5408 | |
100 gev_sigma: 1.6444 | |
101 gev_xi: -0.1383 | |
102 p50_score: 6.51534 | |
103 p50_p_val: 0.0009059744 | |
104 | |
105 =cut | |
106 | |
107 ############################ | |
108 # COMMAND LINE CHECKING. | |
109 ############################ | |
110 | |
111 # Command line argument variables. | |
112 my ($i_help, $i_fasta, $i_model, $i_params, $i_data_id, $i_thr_sc, $i_thr_p, $i_max_merge_dist, $i_p50_out, $i_gp_r, $i_gp_d, $i_gp_epochs, $i_gp_lambda, $i_gp_bitsize, $i_gp_abstr, $i_gp_onlyseq, $i_distr_my, $i_distr_sigma, $i_distr_xi, $i_ap_extlr); | |
113 | |
114 # Parse the command line. | |
115 GetOptions ( "help|h" => \$i_help, | |
116 "fasta:s" => \$i_fasta, | |
117 "model:s" => \$i_model, | |
118 "params:s" => \$i_params, | |
119 "data-id:s" => \$i_data_id, | |
120 "thr-sc:f" => \$i_thr_sc, | |
121 "thr-p:f" => \$i_thr_p, | |
122 "p50-out" => \$i_p50_out, | |
123 "merge-dist:i" => \$i_max_merge_dist, | |
124 "R:i" => \$i_gp_r, | |
125 "D:i" => \$i_gp_d, | |
126 "epochs:i" => \$i_gp_epochs, | |
127 "lambda:f" => \$i_gp_lambda, | |
128 "bitsize:i" => \$i_gp_bitsize, | |
129 "abstr:i" => \$i_gp_abstr, | |
130 "onlyseq" => \$i_gp_onlyseq, | |
131 "distr-my:f" => \$i_distr_my, | |
132 "distr-sigma:f" => \$i_distr_sigma, | |
133 "distr-xi:f" => \$i_distr_xi, | |
134 "ap-extlr:i" => \$i_ap_extlr ) or pod2usage(1); | |
135 | |
136 # Check. | |
137 pod2usage(1) if $i_help; | |
138 ($i_fasta and $i_model) or pod2usage "ERROR: -fasta, -model are mandatory"; | |
139 if ($i_distr_my or $i_distr_sigma or $i_distr_xi) { | |
140 ($i_distr_my and $i_distr_sigma and $i_distr_xi) or pod2usage "ERROR: expects all three distribution parameters to be set"; | |
141 } | |
142 | |
143 ####################### | |
144 # SET PARAMETERS. | |
145 ####################### | |
146 | |
147 # Prefix. | |
148 my $data_id = "GraphProt"; | |
149 if ($i_data_id) { | |
150 $data_id = $i_data_id; | |
151 } | |
152 my $thr_sc = 0; | |
153 if ($i_thr_sc) { | |
154 $thr_sc = $i_thr_sc; | |
155 } | |
156 my $thr_p = 0.05; | |
157 if ($i_thr_p) { | |
158 $thr_p = $i_thr_p; | |
159 } | |
160 my $ap_extlr = 5; | |
161 if ($i_ap_extlr) { | |
162 $ap_extlr = $i_ap_extlr; | |
163 } | |
164 my $max_merge_dist = 0; | |
165 if ($i_max_merge_dist) { | |
166 $max_merge_dist = $i_max_merge_dist; | |
167 } | |
168 | |
169 # Get parameters from .params file. | |
170 my %params; | |
171 if ($i_params) { | |
172 open(IN, $i_params) or die "Cannot open $i_params: $!"; | |
173 while(<IN>) { | |
174 next if ($_ =~ /^#/); | |
175 if ($_ =~ /(.+):\s(.+)/) { | |
176 $params{$1} = $2; | |
177 } | |
178 } | |
179 close IN; | |
180 } | |
181 | |
182 # Create GP parameter string. | |
183 my $params_string = ""; | |
184 # -onlyseq | |
185 my $model_type = "structure"; | |
186 if (exists $params{"model_type"}) { | |
187 if ($params{"model_type"} eq "sequence") { | |
188 $params_string .= " -onlyseq"; | |
189 $model_type = "sequence"; | |
190 } | |
191 } elsif ($i_gp_onlyseq) { | |
192 $params_string .= " -onlyseq"; | |
193 $model_type = "sequence"; | |
194 } | |
195 # -R | |
196 if ($i_gp_r) { | |
197 $params_string .= " -R $i_gp_r"; | |
198 } elsif (exists $params{"R"}) { | |
199 my $v = $params{"R"}; | |
200 $params_string .= " -R $v"; | |
201 } else { | |
202 die "ERROR: -R needs to be set"; | |
203 } | |
204 # -D | |
205 if ($i_gp_d) { | |
206 $params_string .= " -D $i_gp_d"; | |
207 } elsif (exists $params{"D"}) { | |
208 my $v = $params{"D"}; | |
209 $params_string .= " -D $v"; | |
210 } else { | |
211 die "ERROR: -D needs to be set"; | |
212 } | |
213 # -epochs | |
214 if ($i_gp_epochs) { | |
215 $params_string .= " -epochs $i_gp_epochs"; | |
216 } elsif (exists $params{"epochs"}) { | |
217 my $v = $params{"epochs"}; | |
218 $params_string .= " -epochs $v"; | |
219 } else { | |
220 die "ERROR: -epochs needs to be set"; | |
221 } | |
222 # -lambda | |
223 if ($i_gp_lambda) { | |
224 $params_string .= " -lambda $i_gp_lambda"; | |
225 } elsif (exists $params{"lambda"}) { | |
226 my $v = $params{"lambda"}; | |
227 $params_string .= " -lambda $v"; | |
228 } else { | |
229 die "ERROR: -lambda needs to be set"; | |
230 } | |
231 # -bitsize | |
232 if ($i_gp_bitsize) { | |
233 $params_string .= " -bitsize $i_gp_bitsize"; | |
234 } elsif (exists $params{"bitsize"}) { | |
235 my $v = $params{"bitsize"}; | |
236 $params_string .= " -bitsize $v"; | |
237 } else { | |
238 die "ERROR: -bitsize needs to be set"; | |
239 } | |
240 # -abstraction | |
241 if ($i_gp_abstr) { | |
242 $params_string .= " -bitsize $i_gp_abstr"; | |
243 die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure"); | |
244 } elsif (exists $params{"abstraction"}) { | |
245 my $v = $params{"abstraction"}; | |
246 $params_string .= " -abstraction $v"; | |
247 die "ERROR: -abstraction set with -onlyseq" unless ($model_type eq "structure"); | |
248 } | |
249 | |
250 # Distribution parameters. | |
251 my ($distr_my, $distr_sigma, $distr_xi); | |
252 if ($i_distr_my) { | |
253 $distr_my = $i_distr_my; | |
254 } elsif (exists $params{"gev_my"}) { | |
255 $distr_my = $params{"gev_my"}; | |
256 } | |
257 if ($i_distr_sigma) { | |
258 $distr_sigma = $i_distr_sigma; | |
259 } elsif (exists $params{"gev_sigma"}) { | |
260 $distr_sigma = $params{"gev_sigma"}; | |
261 } | |
262 if ($i_distr_xi) { | |
263 $distr_xi = $i_distr_xi; | |
264 } elsif (exists $params{"gev_xi"}) { | |
265 $distr_xi = $params{"gev_xi"}; | |
266 } | |
267 # Average profile extension parameter. | |
268 if (exists $params{"ap_extlr"}) { | |
269 $ap_extlr = $params{"ap_extlr"}; | |
270 } | |
271 | |
272 print STDOUT "model_type: $model_type\n"; | |
273 print STDOUT "ap_extlr: $ap_extlr\n"; | |
274 | |
275 if ($distr_my and $distr_sigma and $distr_xi) { | |
276 print STDOUT "distr_my: $distr_my\n"; | |
277 print STDOUT "distr_sigma: $distr_sigma\n"; | |
278 print STDOUT "distr_xi: $distr_xi\n"; | |
279 } | |
280 print STDOUT "max_merge_dist: $max_merge_dist\n"; | |
281 | |
282 # p50 filter score. | |
283 my $p50_sc; | |
284 if (exists $params{"p50_score"}) { | |
285 $p50_sc = $params{"p50_score"}; | |
286 print STDOUT "p50_score: $p50_sc\n"; | |
287 } | |
288 # p50 p-value. | |
289 my $p50_p; | |
290 if (exists $params{"p50_p_val"}) { | |
291 $p50_p = $params{"p50_p_val"}; | |
292 print STDOUT "p50_p_val: $p50_p\n"; | |
293 } | |
294 | |
295 | |
296 | |
297 ################################## | |
298 # RUN GP PROFILE PREDICTION. | |
299 ################################## | |
300 | |
301 # Read in FASTA file headers. | |
302 my @fasta_ids; | |
303 my $headers = qx/grep ">" $i_fasta/; | |
304 while ($headers =~ />(.+?)\n/g) { | |
305 push(@fasta_ids,$1); | |
306 } | |
307 | |
308 # Run GP profile prediction. | |
309 my $gp_call = "GraphProt.pl -action predict_profile -model $i_model -fasta $i_fasta -prefix $data_id $params_string"; | |
310 print STDOUT "GraphProt call: $gp_call\n"; | |
311 # &> profile_prediction.log | |
312 qx/$gp_call/; | |
313 | |
314 | |
315 #################################### | |
316 # CALCULATE AVERAGE PROFILE. | |
317 #################################### | |
318 | |
319 # Calculate .average_profile from GraphProt .profile file. | |
320 # Also add p-value column if distr parameters set. | |
321 my $add_p = 0; | |
322 if ($distr_my and $distr_sigma and $distr_xi) { | |
323 $add_p = 1; | |
324 } | |
325 | |
326 # Average_profile: 1-based, FASTA headers as IDs, using ap_extlr for averaging scores. | |
327 my $profile = "$data_id.profile"; | |
328 my $average_profile = "$data_id.average_profile"; | |
329 # Calculate window size. | |
330 my $win = $ap_extlr * 2 + 1; | |
331 | |
332 open (IN, $profile) or die "Cannot open $profile: $!\n"; | |
333 open (OUT, '>', $average_profile) or die "Cannot open $average_profile: $!"; | |
334 | |
335 # Old ID. | |
336 my $old_id = "-"; | |
337 # Current ID. | |
338 my $cur_id = "-"; | |
339 # Start position of the window. | |
340 my $pos_inc = 0; | |
341 # Score array. | |
342 my @scores; | |
343 # Input row counter. | |
344 my $c_in = 0; | |
345 # Output row counter. | |
346 my $c_out = 0; | |
347 | |
348 while (<IN>) { | |
349 | |
350 if ($_ =~ /(.+?)\t\d+?\t(.+?)\n/) { | |
351 | |
352 $cur_id = $1; | |
353 my $score = $2; | |
354 $c_in++; | |
355 | |
356 # Case: New refseq ID / new paragraph. | |
357 if ($cur_id ne $old_id) { | |
358 | |
359 # Print remaining entries at the end of former paragraph. | |
360 while (scalar(@scores) >= ($ap_extlr + 1)) { | |
361 # Calculate avg array score. | |
362 my $mean = array_mean(@scores); | |
363 $c_out++; | |
364 $pos_inc++; | |
365 if ($add_p) { | |
366 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); | |
367 my $p = sprintf("%.8f", (1-$y)); | |
368 print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\t$p\n"; | |
369 } else { | |
370 print OUT "$fasta_ids[$old_id]\t$pos_inc\t$mean\n"; | |
371 } | |
372 # Remove old score from beginning. | |
373 shift(@scores); | |
374 } | |
375 | |
376 # Reset for new paragraph. | |
377 $pos_inc = 0; | |
378 @scores = (); | |
379 $old_id = $cur_id; | |
380 # Save first score. | |
381 push(@scores, $score); | |
382 next; | |
383 | |
384 } | |
385 | |
386 # Push score in array for average calculation. | |
387 push(@scores, $score); | |
388 | |
389 # Case: array smaller $win. | |
390 if (scalar(@scores) < $win) { | |
391 # Subcase: array more than half size, start with calculation. | |
392 if (scalar(@scores) >= ($ap_extlr + 1)) { | |
393 # Calculate avg array score. | |
394 my $mean = array_mean(@scores); | |
395 $c_out++; | |
396 $pos_inc++; | |
397 if ($add_p) { | |
398 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); | |
399 my $p = sprintf("%.8f", (1-$y)); | |
400 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; | |
401 } else { | |
402 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; | |
403 } | |
404 } | |
405 next; | |
406 } | |
407 | |
408 # Case: array has $win size. | |
409 if (scalar(@scores) == $win) { | |
410 # Calculate avg array score. | |
411 my $mean = array_mean(@scores); | |
412 $c_out++; | |
413 $pos_inc++; | |
414 if ($add_p) { | |
415 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); | |
416 my $p = sprintf("%.8f", (1-$y)); | |
417 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; | |
418 } else { | |
419 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; | |
420 } | |
421 # Remove old score from beginning. | |
422 shift(@scores); | |
423 next; | |
424 } | |
425 } | |
426 } | |
427 | |
428 # Print remaining entries at the end of last section. | |
429 while (scalar(@scores) >= ($ap_extlr + 1)) { | |
430 # Calculate avg array score. | |
431 my $mean = array_mean(@scores); | |
432 $c_out++; | |
433 $pos_inc++; | |
434 if ($add_p) { | |
435 my $y = exp(-1*(1 + ( ($mean-$distr_my)/$distr_sigma )*$distr_xi)**(-1/$distr_xi)); | |
436 my $p = sprintf("%.8f", (1-$y)); | |
437 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\t$p\n"; | |
438 } else { | |
439 print OUT "$fasta_ids[$cur_id]\t$pos_inc\t$mean\n"; | |
440 } | |
441 # Remove old score from beginning. | |
442 shift(@scores); | |
443 } | |
444 close IN; | |
445 close OUT; | |
446 | |
447 qx/rm -f $profile/; | |
448 | |
449 | |
450 ######################### | |
451 # GET PEAK REGIONS. | |
452 ######################### | |
453 | |
454 | |
455 my $p50_peaks_bed_out = $data_id . ".peak_regions_p50.bed"; | |
456 my $peaks_bed_out = $data_id . ".peak_regions.bed"; | |
457 | |
458 # If p-values were calculated, use set p-value to get peak regions. | |
459 if ($add_p) { | |
460 extract_peak_regions_from_p_values($average_profile, $peaks_bed_out, $max_merge_dist, $thr_p); | |
461 # If p50-out set. | |
462 if ($i_p50_out) { | |
463 # If p50 p-value present, also get peak regions file for this threshold. | |
464 if ($p50_p) { | |
465 extract_peak_regions_from_p_values($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_p); | |
466 } else { | |
467 qx/touch $p50_peaks_bed_out/; | |
468 } | |
469 } | |
470 } else { | |
471 # If no p-values available, use score threshold for defining peak regions. | |
472 extract_peak_regions_from_scores($average_profile, $peaks_bed_out, $max_merge_dist, $thr_sc); | |
473 # If p50-out set. | |
474 if ($i_p50_out) { | |
475 # If p50 score present, also get peak regions file for this threshold. | |
476 if ($p50_sc) { | |
477 extract_peak_regions_from_scores($average_profile, $p50_peaks_bed_out, $max_merge_dist, $p50_sc); | |
478 } else { | |
479 qx/touch $p50_peaks_bed_out/; | |
480 } | |
481 } | |
482 } | |
483 | |
484 exit; | |
485 | |
486 | |
487 | |
488 ################################################################################ | |
489 ################################################################################ | |
490 # SUBROUTINES. | |
491 ################################################################################ | |
492 | |
493 | |
494 sub array_mean { | |
495 | |
496 my $mean = sum(@_)/@_; | |
497 return sprintf("%.5f", $mean); | |
498 | |
499 } | |
500 | |
501 | |
502 ################################################################################ | |
503 | |
504 sub extract_peak_regions_from_p_values { | |
505 | |
506 my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $max_p) = @_; | |
507 | |
508 my $old_ref = "no"; | |
509 my $in = "N"; | |
510 my $ref_id; | |
511 my $start; | |
512 my $end; | |
513 my $best_p = 1; | |
514 my $best_sc = -1000; | |
515 my $region_s; | |
516 my $region_e; | |
517 my $zero_pos; | |
518 | |
519 my $temp_bed_file = "peak_regions.tmp.bed"; | |
520 | |
521 open (IN, $average_profile) or die "Cannot open $average_profile: $!"; | |
522 open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; | |
523 | |
524 while (<IN>) { | |
525 chomp; | |
526 my ($ref, $s, $sc, $p) = (split /\t/)[0,1,2,3]; | |
527 # If file has zero-based positions. | |
528 if ($s == 0) { | |
529 $zero_pos = 1; | |
530 } | |
531 # If positions are one-based, make them zero-based. | |
532 unless ($zero_pos) { | |
533 $s -= 1; | |
534 } | |
535 # At transcript ends, if in positive region, write and reset. | |
536 if ($old_ref ne $ref) { | |
537 if ($in eq "Y") { | |
538 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; | |
539 $in = "N"; | |
540 } | |
541 } | |
542 $old_ref = $ref; | |
543 # Deal with good p-value regions. | |
544 if ($p <= $max_p) { | |
545 # Start of a positive cluster. | |
546 if ($in eq "N") { | |
547 $start = $s; | |
548 $region_s = $s; | |
549 $region_e = $s + 1; | |
550 $end = $s + 1; | |
551 $best_p = $p; | |
552 $best_sc = $sc; | |
553 $ref_id = $ref; | |
554 $in = "Y"; | |
555 next; | |
556 # Inside a positive cluster. | |
557 } elsif ($in eq "Y") { | |
558 if ($p < $best_p) { | |
559 $start = $s; | |
560 $end = $s + 1; | |
561 $best_p = $p; | |
562 $best_sc = $sc; | |
563 $ref_id = $ref; | |
564 } | |
565 $region_e++; | |
566 next; | |
567 } | |
568 } else { | |
569 # If we were in positive cluster before. | |
570 if ($in eq "Y") { | |
571 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; | |
572 $in = "N"; | |
573 } | |
574 } | |
575 } | |
576 # After last line processed. | |
577 if ($in eq "Y") { | |
578 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc;$best_p\t0\t+\n"; | |
579 $in = "N"; | |
580 } | |
581 close IN; | |
582 close OUT; | |
583 # If merge distance zero (i.e. end of one block is -1 from start of next block). | |
584 if ($max_merge_dist == 0) { | |
585 qx/cat $temp_bed_file > $peak_regions_bed_file/; | |
586 } else { | |
587 # Merge nearby regions. | |
588 open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; | |
589 open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!"; | |
590 # For storing current block stats. | |
591 my $block_chr = 0; | |
592 my ($block_s, $block_e, $block_best_pos, $block_best_p, $block_best_sc); | |
593 while (<IN>) { | |
594 chomp; | |
595 my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3]; | |
596 my ($best_pos, $best_sc, $best_p) = (split /;/, $id); | |
597 if ($chr eq $block_chr) { | |
598 # If $block_e, $s within merge merge. | |
599 if ( ($s - $block_e) <= $max_merge_dist ) { | |
600 # Update block stats. | |
601 $block_e = $e; | |
602 if ($block_best_p > $best_p) { | |
603 $block_best_p = $best_p; | |
604 $block_best_sc = $best_sc; | |
605 $block_best_pos = $best_pos; | |
606 } | |
607 } else { | |
608 # If $e outside merge range, print block. | |
609 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; | |
610 # Store new block. | |
611 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p); | |
612 } | |
613 | |
614 } else { | |
615 # If new chromosome, print last block, otherwise it is the first block. | |
616 if ($block_chr) { | |
617 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; | |
618 } | |
619 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc, $block_best_p) = ($chr, $s, $e, $best_pos, $best_sc, $best_p); | |
620 } | |
621 } | |
622 # Print last block. | |
623 if ($block_chr) { | |
624 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc;$block_best_p\t0\t+\n"; | |
625 } | |
626 close OUT; | |
627 close IN; | |
628 } | |
629 qx/rm -f $temp_bed_file/; | |
630 } | |
631 | |
632 | |
633 ################################################################################ | |
634 | |
635 sub extract_peak_regions_from_scores { | |
636 | |
637 my ($average_profile, $peak_regions_bed_file, $max_merge_dist, $min_sc) = @_; | |
638 | |
639 my $old_ref = "no"; | |
640 my $in = "N"; | |
641 my $ref_id; | |
642 my $start; | |
643 my $end; | |
644 my $best_sc = -1000; | |
645 my $region_s; | |
646 my $region_e; | |
647 my $zero_pos; | |
648 | |
649 my $temp_bed_file = "peak_regions.tmp.bed"; | |
650 | |
651 open (IN, $average_profile) or die "Cannot open $average_profile: $!"; | |
652 open (OUT, '>', $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; | |
653 | |
654 while (<IN>) { | |
655 chomp; | |
656 my ($ref, $s, $sc) = (split /\t/)[0,1,2]; | |
657 # If file has zero-based positions. | |
658 if ($s == 0) { | |
659 $zero_pos = 1; | |
660 } | |
661 # If positions are one-based, make them zero-based. | |
662 unless ($zero_pos) { | |
663 $s -= 1; | |
664 } | |
665 # At transcript ends, if in positive region, write and reset. | |
666 if ($old_ref ne $ref) { | |
667 if ($in eq "Y") { | |
668 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; | |
669 $in = "N"; | |
670 } | |
671 } | |
672 $old_ref = $ref; | |
673 # Deal with positive regions. | |
674 if ($sc > $min_sc) { | |
675 # Start of a positive cluster. | |
676 if ($in eq "N") { | |
677 $start = $s; | |
678 $region_s = $s; | |
679 $region_e = $s + 1; | |
680 $end = $s + 1; | |
681 $best_sc = $sc; | |
682 $ref_id = $ref; | |
683 $in = "Y"; | |
684 next; | |
685 # Inside a positive cluster. | |
686 } elsif ($in eq "Y") { | |
687 if ($sc > $best_sc) { | |
688 $start = $s; | |
689 $end = $s + 1; | |
690 $best_sc = $sc; | |
691 $ref_id = $ref; | |
692 } | |
693 $region_e++; | |
694 next; | |
695 } | |
696 } else { | |
697 # If we were in positive cluster before. | |
698 if ($in eq "Y") { | |
699 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; | |
700 $in = "N"; | |
701 } | |
702 } | |
703 } | |
704 # After last line processed. | |
705 if ($in eq "Y") { | |
706 print OUT "$ref_id\t$region_s\t$region_e\t$end;$best_sc\t0\t+\n"; | |
707 $in = "N"; | |
708 } | |
709 close IN; | |
710 close OUT; | |
711 # If merge distance zero (i.e. end of one block is -1 from start of next block). | |
712 if ($max_merge_dist == 0) { | |
713 qx/cat $temp_bed_file > $peak_regions_bed_file/; | |
714 } else { | |
715 # Merge nearby regions. | |
716 open(IN, $temp_bed_file) or die "Cannot open $temp_bed_file: $!"; | |
717 open(OUT, '>', $peak_regions_bed_file) or die "Cannot open $peak_regions_bed_file: $!"; | |
718 # For storing current block stats. | |
719 my $block_chr = 0; | |
720 my ($block_s, $block_e, $block_best_pos, $block_best_sc); | |
721 while (<IN>) { | |
722 chomp; | |
723 my ($chr, $s, $e, $id) = (split /\t/)[0,1,2,3]; | |
724 my ($best_pos, $best_sc) = (split /;/, $id); | |
725 if ($chr eq $block_chr) { | |
726 # If $block_e, $s within merge merge. | |
727 if ( ($s - $block_e) <= $max_merge_dist ) { | |
728 # Update block stats. | |
729 $block_e = $e; | |
730 if ($block_best_sc < $best_sc) { | |
731 $block_best_sc = $best_sc; | |
732 $block_best_pos = $best_pos; | |
733 } | |
734 } else { | |
735 # If $e outside merge range, print block. | |
736 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; | |
737 # Store new block. | |
738 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc); | |
739 } | |
740 | |
741 } else { | |
742 # If new chromosome, print last block, otherwise it is the first block. | |
743 if ($block_chr) { | |
744 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; | |
745 } | |
746 ($block_chr, $block_s, $block_e, $block_best_pos, $block_best_sc) = ($chr, $s, $e, $best_pos, $best_sc); | |
747 } | |
748 | |
749 } | |
750 # Print last block. | |
751 if ($block_chr) { | |
752 print OUT "$block_chr\t$block_s\t$block_e\t$block_best_pos;$block_best_sc\t0\t+\n"; | |
753 } | |
754 close OUT; | |
755 close IN; | |
756 } | |
757 qx/rm -f $temp_bed_file/; | |
758 } | |
759 | |
760 | |
761 ################################################################################ | |
762 | |
763 | |
764 |