Mercurial > repos > bgruening > bismark
comparison bismark_methylation_extractor @ 3:91f07ff056ca draft
Uploaded
author | bgruening |
---|---|
date | Mon, 14 Apr 2014 16:43:14 -0400 |
parents | 62c6da72dd4a |
children |
comparison
equal
deleted
inserted
replaced
2:82814a8a2395 | 3:91f07ff056ca |
---|---|
6 use Cwd; | 6 use Cwd; |
7 use Carp; | 7 use Carp; |
8 use FindBin qw($Bin); | 8 use FindBin qw($Bin); |
9 use lib "$Bin/../lib"; | 9 use lib "$Bin/../lib"; |
10 | 10 |
11 | |
11 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) | 12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) |
12 | 13 |
13 ## This program is free software: you can redistribute it and/or modify | 14 ## This program is free software: you can redistribute it and/or modify |
14 ## it under the terms of the GNU General Public License as published by | 15 ## it under the terms of the GNU General Public License as published by |
15 ## the Free Software Foundation, either version 3 of the License, or | 16 ## the Free Software Foundation, either version 3 of the License, or |
27 my %counting; | 28 my %counting; |
28 my $parent_dir = getcwd(); | 29 my $parent_dir = getcwd(); |
29 | 30 |
30 my %fhs; | 31 my %fhs; |
31 | 32 |
32 my $version = 'v0.7.11'; | 33 my $version = 'v0.10.1'; |
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline(); | 34 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline(); |
34 | 35 |
35 | 36 |
36 ### only needed for bedGraph output | 37 ### only needed for bedGraph output |
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files | 38 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files |
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | 39 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total |
39 my @bedfiles; | 40 my @bedfiles; |
40 | 41 |
41 ### only needed for genome-wide cytosine methylation report | 42 ### only needed for genome-wide cytosine methylation report |
42 my %chromosomes; | 43 my %chromosomes; |
43 | 44 |
45 my %mbias_1; | |
46 my %mbias_2; | |
47 | |
44 ############################################################################################## | 48 ############################################################################################## |
45 ### Summarising Run Parameters | 49 ### Summarising Run Parameters |
46 ############################################################################################## | 50 ############################################################################################## |
47 | 51 |
48 ### METHYLATION EXTRACTOR | 52 ### METHYLATION EXTRACTOR |
65 else{ | 69 else{ |
66 warn "Bismark paired-end SAM format specified (default)\n"; # default | 70 warn "Bismark paired-end SAM format specified (default)\n"; # default |
67 } | 71 } |
68 } | 72 } |
69 | 73 |
70 if ($ignore){ | 74 if ($single){ |
71 warn "First $ignore bases will be disregarded when processing the methylation call string\n"; | 75 if ($ignore){ |
76 warn "First $ignore bp will be disregarded when processing the methylation call string\n"; | |
77 } | |
72 } | 78 } |
79 else{ ## paired-end | |
80 if ($ignore){ | |
81 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; | |
82 } | |
83 if ($ignore_r2){ | |
84 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; | |
85 } | |
86 } | |
87 | |
73 | 88 |
74 if ($full){ | 89 if ($full){ |
75 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | 90 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; |
76 } | 91 } |
77 if ($merge_non_CpG){ | 92 if ($merge_non_CpG){ |
93 if ($bedGraph){ | 108 if ($bedGraph){ |
94 warn "\n\nSummarising bedGraph parameters:\n"; | 109 warn "\n\nSummarising bedGraph parameters:\n"; |
95 warn '='x63,"\n"; | 110 warn '='x63,"\n"; |
96 | 111 |
97 if ($counts){ | 112 if ($counts){ |
98 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"; | 113 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n"; |
99 } | 114 } |
100 else{ | 115 else{ |
101 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; | 116 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; |
102 } | 117 } |
103 | 118 |
113 | 128 |
114 if ($remove){ | 129 if ($remove){ |
115 warn "White spaces in read ID names will be removed prior to sorting\n"; | 130 warn "White spaces in read ID names will be removed prior to sorting\n"; |
116 } | 131 } |
117 | 132 |
118 if (defined $sort_size){ | 133 if ($ample_mem){ |
134 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; | |
135 } | |
136 elsif (defined $sort_size){ | |
119 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; | 137 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; |
120 } | 138 } |
121 else{ | 139 else{ |
122 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; | 140 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; |
123 } | 141 } |
184 total_unmethylated_CHG_count => 0, | 202 total_unmethylated_CHG_count => 0, |
185 total_unmethylated_CHH_count => 0, | 203 total_unmethylated_CHH_count => 0, |
186 total_unmethylated_CpG_count => 0, | 204 total_unmethylated_CpG_count => 0, |
187 sequences_count => 0, | 205 sequences_count => 0, |
188 ); | 206 ); |
207 | |
189 @sorting_files = (); | 208 @sorting_files = (); |
190 @bedfiles = (); | 209 @bedfiles = (); |
210 | |
211 %mbias_1 = (); | |
212 %mbias_2 = (); | |
213 | |
214 ### performing a quick check to see if a paired-end SAM file has been sorted by position, which interferes with the logic used by the extractor | |
215 unless ($vanilla){ | |
216 if ($paired){ | |
217 test_positional_sorting($filename); | |
218 } | |
219 } | |
191 | 220 |
192 process_Bismark_results_file($filename); | 221 process_Bismark_results_file($filename); |
193 | 222 |
194 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | 223 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues |
195 foreach my $fh (keys %fhs) { | 224 foreach my $fh (keys %fhs) { |
196 if ($fh =~ /^[1230]$/) { | 225 if ($fh =~ /^[1230]$/) { |
197 foreach my $context (keys %{$fhs{$fh}}) { | 226 foreach my $context (keys %{$fhs{$fh}}) { |
198 close $fhs{$fh}->{$context} or die $!; | 227 close $fhs{$fh}->{$context} or die $!; |
199 | 228 } |
200 } | 229 } |
201 } else { | 230 else{ |
202 close $fhs{$fh} or die $!; | 231 close $fhs{$fh} or die $!; |
203 } | 232 } |
204 } | 233 } |
234 | |
235 ### printing out all M-Bias data | |
236 produce_mbias_plots ($filename); | |
237 | |
238 delete_unused_files(); | |
205 | 239 |
206 if ($bedGraph){ | 240 if ($bedGraph){ |
207 | 241 |
208 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | 242 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified |
209 $out =~ s/gz$//; | 243 $out =~ s/gz$//; |
210 $out =~ s/sam$//; | 244 $out =~ s/sam$//; |
211 $out =~ s/bam$//; | 245 $out =~ s/bam$//; |
212 $out =~ s/txt$//; | 246 $out =~ s/txt$//; |
213 $out =~ s/$/bedGraph/; | 247 $out =~ s/$/bedGraph/; |
214 | 248 |
215 | |
216 | |
217 my $bedGraph_output = $out; | 249 my $bedGraph_output = $out; |
218 my @args; | 250 my @args; |
219 | 251 |
220 if ($remove){ | 252 if ($remove){ |
221 push @args, '--remove'; | 253 push @args, '--remove'; |
224 push @args, '--CX_context'; | 256 push @args, '--CX_context'; |
225 } | 257 } |
226 if ($no_header){ | 258 if ($no_header){ |
227 push @args, '--no_header'; | 259 push @args, '--no_header'; |
228 } | 260 } |
229 if ($counts){ | 261 if ($gazillion){ |
230 push @args, "--counts"; | 262 push @args, '--gazillion'; |
231 } | 263 } |
264 if ($ample_mem){ | |
265 push @args, '--ample_memory'; | |
266 } | |
267 | |
268 | |
269 # if ($counts){ | |
270 # push @args, "--counts"; | |
271 # } | |
232 | 272 |
233 push @args, "--buffer_size $sort_size"; | 273 push @args, "--buffer_size $sort_size"; |
234 push @args, "--cutoff $coverage_threshold"; | 274 push @args, "--cutoff $coverage_threshold"; |
235 push @args, "--output $bedGraph_output"; | 275 push @args, "--output $bedGraph_output"; |
236 push @args, "--dir '$output_dir'"; | 276 push @args, "--dir '$output_dir'"; |
252 # process_bedGraph_output(); | 292 # process_bedGraph_output(); |
253 # close OUT or die $!; | 293 # close OUT or die $!; |
254 | 294 |
255 ### genome-wide cytosine methylation report requires bedGraph processing anyway | 295 ### genome-wide cytosine methylation report requires bedGraph processing anyway |
256 if ($cytosine_report){ | 296 if ($cytosine_report){ |
297 | |
257 @args = (); # resetting @args | 298 @args = (); # resetting @args |
258 my $cytosine_out = $out; | 299 my $cytosine_out = $out; |
259 $cytosine_out =~ s/bedGraph$//; | 300 $cytosine_out =~ s/bedGraph$//; |
260 | 301 |
261 if ($CX_context){ | 302 if ($CX_context){ |
278 } | 319 } |
279 if ($split_by_chromosome){ | 320 if ($split_by_chromosome){ |
280 push @args, '--split_by_chromosome'; | 321 push @args, '--split_by_chromosome'; |
281 } | 322 } |
282 | 323 |
283 push @args, $bedGraph_output; # this will be the infile | 324 my $coverage_output = $bedGraph_output; |
284 | 325 $coverage_output =~ s/bedGraph$/bismark.cov/; |
285 system ("$Bin/bedGraph2cytosine @args"); | 326 |
327 push @args, $output_dir . $coverage_output; # this will be the infile | |
328 | |
329 system ("$Bin/coverage2cytosine @args"); | |
286 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); | 330 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); |
287 warn "\n\nFinished generating genome-wide cytosine report\n\n"; | 331 warn "\n\nFinished generating genome-wide cytosine report\n\n"; |
288 } | 332 } |
289 } | 333 } |
290 } | 334 } |
291 | 335 |
336 sub delete_unused_files{ | |
337 | |
338 warn "Deleting unused files ...\n\n"; sleep(1); | |
339 | |
340 my $index = 0; | |
341 | |
342 while ($index <= $#sorting_files){ | |
343 if ($sorting_files[$index] =~ /gz$/){ | |
344 open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
345 } | |
346 else{ | |
347 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
348 } | |
349 | |
350 my $used = 0; | |
351 | |
352 while (<USED>){ | |
353 next if (/^Bismark/); | |
354 if ($_){ | |
355 $used = 1; | |
356 last; | |
357 } | |
358 } | |
359 | |
360 if ($used){ | |
361 warn "$sorting_files[$index] contains data ->\tkept\n"; | |
362 ++$index; | |
363 } | |
364 else{ | |
365 | |
366 my $delete = unlink $sorting_files[$index]; | |
367 | |
368 if ($delete){ | |
369 warn "$sorting_files[$index] was empty ->\tdeleted\n"; | |
370 } | |
371 else{ | |
372 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" | |
373 } | |
374 | |
375 ### we also need to remove the element from @sorting_files | |
376 splice @sorting_files, $index, 1; | |
377 } | |
378 } | |
379 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) | |
380 } | |
381 | |
382 sub produce_mbias_plots{ | |
383 | |
384 my $filename = shift; | |
385 | |
386 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
387 $mbias =~ s/gz$//; | |
388 $mbias =~ s/sam$//; | |
389 $mbias =~ s/bam$//; | |
390 $mbias =~ s/txt$//; | |
391 my $mbias_graph_1 = my $mbias_graph_2 = $mbias; | |
392 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; | |
393 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; | |
394 | |
395 $mbias =~ s/$/M-bias.txt/; | |
396 | |
397 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; | |
398 | |
399 # determining maximum read length | |
400 my $max_length_1 = 0; | |
401 my $max_length_2 = 0; | |
402 | |
403 foreach my $context (keys %mbias_1){ | |
404 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ | |
405 $max_length_1 = $pos unless ($max_length_1 >= $pos); | |
406 } | |
407 } | |
408 if ($paired){ | |
409 foreach my $context (keys %mbias_2){ | |
410 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ | |
411 $max_length_2 = $pos unless ($max_length_2 >= $pos); | |
412 } | |
413 } | |
414 } | |
415 | |
416 if ($single){ | |
417 warn "Determining maximum read length for M-Bias plot\n"; | |
418 warn "Maximum read length of Read 1: $max_length_1\n\n"; | |
419 } | |
420 else{ | |
421 warn "Determining maximum read lengths for M-Bias plots\n"; | |
422 warn "Maximum read length of Read 1: $max_length_1\n"; | |
423 warn "Maximum read length of Read 2: $max_length_2\n\n"; | |
424 } | |
425 # sleep(3); | |
426 | |
427 my @mbias_read1; | |
428 my @mbias_read2; | |
429 | |
430 #Check whether the module GD::Graph::lines is installed | |
431 my $gd_graph_installed = 0; | |
432 eval{ | |
433 require GD::Graph::lines; | |
434 GD::Graph::lines->import(); | |
435 }; | |
436 | |
437 unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...} | |
438 $gd_graph_installed = 1; | |
439 | |
440 #Check whether the module GD::Graph::colour is installed | |
441 eval{ | |
442 require GD::Graph::colour; | |
443 GD::Graph::colour->import(qw(:colours :lists :files :convert)); | |
444 }; | |
445 | |
446 if ($@) { | |
447 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
448 sleep(2); | |
449 $gd_graph_installed = 0; | |
450 } | |
451 | |
452 | |
453 } | |
454 else{ | |
455 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
456 sleep(2); | |
457 } | |
458 | |
459 | |
460 my $graph_title; | |
461 my $graph1; | |
462 my $graph2; | |
463 | |
464 if ( $gd_graph_installed){ | |
465 $graph1 = GD::Graph::lines->new(800,600); | |
466 if ($paired){ | |
467 $graph2 = GD::Graph::lines->new(800,600); | |
468 } | |
469 } | |
470 | |
471 foreach my $context (qw(CpG CHG CHH)){ | |
472 @{$mbias_read1[0]} = (); | |
473 | |
474 if ($paired){ | |
475 print MBIAS "$context context (R1)\n================\n"; | |
476 $graph_title = 'M-bias (Read 1)'; | |
477 } | |
478 else{ | |
479 print MBIAS "$context context\n===========\n"; | |
480 $graph_title = 'M-bias'; | |
481 } | |
482 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
483 | |
484 foreach my $pos (1..$max_length_1){ | |
485 | |
486 unless (defined $mbias_1{$context}->{$pos}->{meth}){ | |
487 $mbias_1{$context}->{$pos}->{meth} = 0; | |
488 } | |
489 unless (defined $mbias_1{$context}->{$pos}->{un}){ | |
490 $mbias_1{$context}->{$pos}->{un} = 0; | |
491 } | |
492 | |
493 my $percent = ''; | |
494 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ | |
495 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); | |
496 } | |
497 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; | |
498 | |
499 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
500 push @{$mbias_read1[0]},$pos; | |
501 | |
502 if ($context eq 'CpG'){ | |
503 push @{$mbias_read1[1]},$percent; | |
504 push @{$mbias_read1[4]},$coverage; | |
505 } | |
506 elsif ($context eq 'CHG'){ | |
507 push @{$mbias_read1[2]},$percent; | |
508 push @{$mbias_read1[5]},$coverage; | |
509 } | |
510 elsif ($context eq 'CHH'){ | |
511 push @{$mbias_read1[3]},$percent; | |
512 push @{$mbias_read1[6]},$coverage; | |
513 } | |
514 } | |
515 print MBIAS "\n"; | |
516 } | |
517 | |
518 if ( $gd_graph_installed){ | |
519 | |
520 add_colour(nice_blue => [31,120,180]); | |
521 add_colour(nice_orange => [255,127,0]); | |
522 add_colour(nice_green => [51,160,44]); | |
523 add_colour(pale_blue => [153,206,227]); | |
524 add_colour(pale_orange => [253,204,138]); | |
525 add_colour(pale_green => [191,230,207]); | |
526 | |
527 $graph1->set( | |
528 x_label => 'position (bp)', | |
529 y1_label => '% methylation', | |
530 y2_label => '# methylation calls', | |
531 title => $graph_title, | |
532 line_width => 2, | |
533 x_max_value => $max_length_1, | |
534 x_min_value => 0, | |
535 y_tick_number => 10, | |
536 y_label_skip => 2, | |
537 y1_max_value => 100, | |
538 y1_min_value => 0, | |
539 y_label_skip => 2, | |
540 y2_min_value => 0, | |
541 x_label_skip => 5, | |
542 x_label_position => 0.5, | |
543 x_tick_offset => -1, | |
544 bgclr => 'white', | |
545 transparent => 0, | |
546 two_axes => 1, | |
547 use_axis => [1,1,1,2,2,2], | |
548 legend_placement => 'RC', | |
549 legend_spacing => 6, | |
550 legend_marker_width => 24, | |
551 legend_marker_height => 18, | |
552 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
553 ) or die $graph1->error; | |
554 | |
555 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
556 | |
557 my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; | |
558 | |
559 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; | |
560 binmode MBIAS_G1; | |
561 print MBIAS_G1 $gd1->png; | |
562 } | |
563 | |
564 if ($paired){ | |
565 | |
566 foreach my $context (qw(CpG CHG CHH)){ | |
567 @{$mbias_read2[0]} = (); | |
568 | |
569 print MBIAS "$context context (R2)\n================\n"; | |
570 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
571 foreach my $pos (1..$max_length_2){ | |
572 | |
573 unless (defined $mbias_2{$context}->{$pos}->{meth}){ | |
574 $mbias_2{$context}->{$pos}->{meth} = 0; | |
575 } | |
576 unless (defined $mbias_2{$context}->{$pos}->{un}){ | |
577 $mbias_2{$context}->{$pos}->{un} = 0; | |
578 } | |
579 | |
580 my $percent = ''; | |
581 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ | |
582 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); | |
583 } | |
584 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; | |
585 | |
586 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
587 | |
588 push @{$mbias_read2[0]},$pos; | |
589 | |
590 if ($context eq 'CpG'){ | |
591 push @{$mbias_read2[1]},$percent; | |
592 push @{$mbias_read2[4]},$coverage; | |
593 } | |
594 elsif ($context eq 'CHG'){ | |
595 push @{$mbias_read2[2]},$percent; | |
596 push @{$mbias_read2[5]},$coverage; | |
597 } | |
598 elsif ($context eq 'CHH'){ | |
599 push @{$mbias_read2[3]},$percent; | |
600 push @{$mbias_read2[6]},$coverage; | |
601 } | |
602 } | |
603 print MBIAS "\n"; | |
604 } | |
605 | |
606 if ( $gd_graph_installed){ | |
607 | |
608 add_colour(nice_blue => [31,120,180]); | |
609 add_colour(nice_orange => [255,127,0]); | |
610 add_colour(nice_green => [51,160,44]); | |
611 add_colour(pale_blue => [153,206,227]); | |
612 add_colour(pale_orange => [253,204,138]); | |
613 add_colour(pale_green => [191,230,207]); | |
614 | |
615 $graph2->set( | |
616 x_label => 'position (bp)', | |
617 line_width => 2, | |
618 x_max_value => $max_length_1, | |
619 x_min_value => 0, | |
620 y_tick_number => 10, | |
621 y_label_skip => 2, | |
622 y1_max_value => 100, | |
623 y1_min_value => 0, | |
624 y_label_skip => 2, | |
625 y2_min_value => 0, | |
626 x_label_skip => 5, | |
627 x_label_position => 0.5, | |
628 x_tick_offset => -1, | |
629 bgclr => 'white', | |
630 transparent => 0, | |
631 two_axes => 1, | |
632 use_axis => [1,1,1,2,2,2], | |
633 legend_placement => 'RC', | |
634 legend_spacing => 6, | |
635 legend_marker_width => 24, | |
636 legend_marker_height => 18, | |
637 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
638 x_label => 'position (bp)', | |
639 y1_label => '% methylation', | |
640 y2_label => '# calls', | |
641 title => 'M-bias (Read 2)', | |
642 ) or die $graph2->error; | |
643 | |
644 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
645 my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; | |
646 | |
647 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; | |
648 binmode MBIAS_G2; | |
649 print MBIAS_G2 $gd2->png; | |
650 | |
651 } | |
652 } | |
653 } | |
292 | 654 |
293 sub process_commandline{ | 655 sub process_commandline{ |
294 my $help; | 656 my $help; |
295 my $single_end; | 657 my $single_end; |
296 my $paired_end; | 658 my $paired_end; |
297 my $ignore; | 659 my $ignore; |
660 my $ignore_r2; | |
298 my $genomic_fasta; | 661 my $genomic_fasta; |
299 my $full; | 662 my $full; |
300 my $report; | 663 my $report; |
301 my $extractor_version; | 664 my $extractor_version; |
302 my $no_overlap; | 665 my $no_overlap; |
315 my $CX_context; | 678 my $CX_context; |
316 my $split_by_chromosome; | 679 my $split_by_chromosome; |
317 my $sort_size; | 680 my $sort_size; |
318 my $samtools_path; | 681 my $samtools_path; |
319 my $gzip; | 682 my $gzip; |
320 | 683 my $mbias_only; |
321 my $command_line = GetOptions ('help|man' => \$help, | 684 my $gazillion; |
322 'p|paired-end' => \$paired_end, | 685 my $ample_mem; |
323 's|single-end' => \$single_end, | 686 |
324 'fasta' => \$genomic_fasta, | 687 my $command_line = GetOptions ('help|man' => \$help, |
325 'ignore=i' => \$ignore, | 688 'p|paired-end' => \$paired_end, |
326 'comprehensive' => \$full, | 689 's|single-end' => \$single_end, |
327 'report' => \$report, | 690 'fasta' => \$genomic_fasta, |
328 'version' => \$extractor_version, | 691 'ignore=i' => \$ignore, |
329 'no_overlap' => \$no_overlap, | 692 'ignore_r2=i' => \$ignore_r2, |
330 'merge_non_CpG' => \$merge_non_CpG, | 693 'comprehensive' => \$full, |
331 'vanilla' => \$vanilla, | 694 'report' => \$report, |
332 'o|output=s' => \$output_dir, | 695 'version' => \$extractor_version, |
333 'no_header' => \$no_header, | 696 'no_overlap' => \$no_overlap, |
334 'bedGraph' => \$bedGraph, | 697 'merge_non_CpG' => \$merge_non_CpG, |
335 "cutoff=i" => \$coverage_threshold, | 698 'vanilla' => \$vanilla, |
336 "remove_spaces" => \$remove, | 699 'o|output=s' => \$output_dir, |
337 "counts" => \$counts, | 700 'no_header' => \$no_header, |
338 "cytosine_report" => \$cytosine_report, | 701 'bedGraph' => \$bedGraph, |
339 'g|genome_folder=s' => \$genome_folder, | 702 "cutoff=i" => \$coverage_threshold, |
340 "zero_based" => \$zero, | 703 "remove_spaces" => \$remove, |
341 "CX|CX_context" => \$CX_context, | 704 "counts" => \$counts, |
342 "split_by_chromosome" => \$split_by_chromosome, | 705 "cytosine_report" => \$cytosine_report, |
343 "buffer_size=s" => \$sort_size, | 706 'g|genome_folder=s' => \$genome_folder, |
344 'samtools_path=s' => \$samtools_path, | 707 "zero_based" => \$zero, |
345 "gzip" => \$gzip, | 708 "CX|CX_context" => \$CX_context, |
346 ); | 709 "split_by_chromosome" => \$split_by_chromosome, |
710 "buffer_size=s" => \$sort_size, | |
711 'samtools_path=s' => \$samtools_path, | |
712 "gzip" => \$gzip, | |
713 "mbias_only" => \$mbias_only, | |
714 "gazillion|scaffolds" => \$gazillion, | |
715 "ample_memory" => \$ample_mem, | |
716 ); | |
347 | 717 |
348 ### EXIT ON ERROR if there were errors with any of the supplied options | 718 ### EXIT ON ERROR if there were errors with any of the supplied options |
349 unless ($command_line){ | 719 unless ($command_line){ |
350 die "Please respecify command line options\n"; | 720 die "Please respecify command line options\n"; |
351 } | 721 } |
378 } | 748 } |
379 @filenames = @ARGV; | 749 @filenames = @ARGV; |
380 | 750 |
381 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | 751 warn "\n *** Bismark methylation extractor version $version ***\n\n"; |
382 | 752 |
383 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | 753 ### M-BIAS ONLY |
384 unless ($ignore){ | 754 if ($mbias_only){ |
385 $ignore = 0; | 755 if ($bedGraph){ |
386 } | 756 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; |
757 } | |
758 if ($cytosine_report){ | |
759 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; | |
760 } | |
761 if ($merge_non_CpG){ | |
762 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; | |
763 } | |
764 if ($full){ | |
765 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; | |
766 } | |
767 sleep(3); | |
768 } | |
769 | |
387 ### PRINT A REPORT | 770 ### PRINT A REPORT |
388 unless ($report){ | 771 unless ($report){ |
389 $report = 0; | 772 $report = 0; |
390 } | 773 } |
391 | 774 |
414 } | 797 } |
415 elsif ($paired_end){ | 798 elsif ($paired_end){ |
416 $single_end = 0; ### PAIRED-END ALIGNMENTS | 799 $single_end = 0; ### PAIRED-END ALIGNMENTS |
417 } | 800 } |
418 else{ | 801 else{ |
419 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | 802 |
420 } | 803 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header |
804 | |
805 if ($vanilla){ | |
806 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; | |
807 } | |
808 else{ # SAM/BAM format | |
809 | |
810 my $file = $filenames[0]; | |
811 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); | |
812 | |
813 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file | |
814 if ($file =~ /\.gz$/){ | |
815 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
816 } | |
817 elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){ | |
818 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
819 } | |
820 else{ | |
821 open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; | |
822 } | |
823 | |
824 while (<DETERMINE>){ | |
825 last unless (/^\@/); | |
826 if ($_ =~ /^\@PG/){ | |
827 # warn "found the \@PG line:\n"; | |
828 # warn "$_"; | |
829 | |
830 if ($_ =~ /-1/ and $_ =~ /-2/){ | |
831 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
832 $paired_end = 1; | |
833 $single_end = 0; | |
834 } | |
835 else{ | |
836 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
837 $paired_end = 0; | |
838 $single_end = 1; | |
839 } | |
840 } | |
841 } | |
842 | |
843 close DETERMINE or warn $!; | |
844 | |
845 } | |
846 } | |
847 | |
848 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | |
849 unless ($ignore){ | |
850 $ignore = 0; | |
851 } | |
852 | |
853 if (defined $ignore_r2){ | |
854 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); | |
855 } | |
856 else{ | |
857 $ignore_r2 = 0; | |
858 } | |
859 | |
421 | 860 |
422 ### NO OVERLAP | 861 ### NO OVERLAP |
423 if ($no_overlap){ | 862 if ($no_overlap){ |
424 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | 863 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); |
425 } | 864 } |
472 else{ | 911 else{ |
473 $CX_context = 0; | 912 $CX_context = 0; |
474 } | 913 } |
475 | 914 |
476 unless ($counts){ | 915 unless ($counts){ |
477 $counts = 0; | 916 $counts = 1; # counts will always be set |
478 } | 917 } |
479 | 918 |
480 if ($cytosine_report){ | 919 if ($cytosine_report){ |
481 | 920 |
482 ### GENOME folder | 921 ### GENOME folder |
492 unless ($bedGraph){ | 931 unless ($bedGraph){ |
493 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; | 932 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; |
494 $bedGraph = 1; | 933 $bedGraph = 1; |
495 } | 934 } |
496 unless ($counts){ | 935 unless ($counts){ |
497 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; | 936 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; |
498 $counts = 1; | 937 $counts = 1; |
499 } | 938 } |
500 warn "\n"; | 939 warn "\n"; |
501 } | 940 } |
502 | 941 |
534 | 973 |
535 unless (defined $samtools_path){ | 974 unless (defined $samtools_path){ |
536 $samtools_path = ''; | 975 $samtools_path = ''; |
537 } | 976 } |
538 | 977 |
539 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip); | 978 |
979 if ($gazillion){ | |
980 if ($ample_mem){ | |
981 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; | |
982 } | |
983 } | |
984 | |
985 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem); | |
986 } | |
987 | |
988 | |
### Sanity check for paired-end Bismark alignment files: dies if the file appears to have been
### sorted by chromosomal position, since the extraction relies on Read 1 and Read 2 of a pair
### directly following each other.
###
### Argument: the name of the Bismark result file (SAM, BAM or gzipped SAM).
### Reads the file-level variable $samtools_path (needed for BAM input).
### Returns nothing useful; it either dies with an explanatory message or prints '...passed!'.
###
### Two tests are carried out:
###   1) header lines are checked for a sort-order flag indicating coordinate sorting
###   2) the first 100000 alignment lines are read in pairs, and the read IDs of each pair
###      have to be identical (after removing a trailing /1 or /2 added by older Bismark versions)
sub test_positional_sorting{

  my $filename = shift;

  print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
  sleep(1);

  ### Files not ending in .gz or bam are probed with 'file -b' to detect compressed (BAM) content.
  ### The list form of the pipe open is used so that the file name is never interpreted by a shell.
  my $treat_as_bam = ($filename =~ /bam$/) ? 1 : 0;
  unless ($treat_as_bam or $filename =~ /\.gz$/){
    if (open (my $probe,'-|','file','-b',$filename)){
      my $description = <$probe>;
      close $probe;
      $treat_as_bam = 1 if (defined $description and $description =~ /^gzip/);
    }
  }

  my $test_fh; # lexical filehandle so that it cannot clash with handles used elsewhere

  if ($filename =~ /\.gz$/) {
    open ($test_fh,'-|','zcat',$filename) or die "Can't open gzipped file $filename: $!\n";
  }
  elsif ($treat_as_bam) {
    if ($samtools_path){
      open ($test_fh,'-|',$samtools_path,'view','-h',$filename) or die "Can't open BAM file $filename: $!\n";
    }
    else{
      die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
    }
  }
  else {
    open ($test_fh,'<',$filename) or die "Can't open file $filename: $!\n";
  }

  my $count = 0;

  while (my $line = <$test_fh>) {
    if ($line =~ /^\@/) {	     # testing header lines if they contain a sort-order flag (for being sorted)
      ### The SAM specification stores the sort order as the SO: tag of the @HD header line
      ### (e.g. "@HD	VN:1.0	SO:coordinate"). The literal '@SO' test is kept for backwards compatibility
      if ($line =~ /^\@SO/ or $line =~ /^\@HD\b.*\bSO:coordinate\b/) {
	die "SAM/BAM header line '$line' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
      }
      next;
    }
    $count++;

    last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

    my ($id_1) = split (/\t/,$line);

    ### reading the next line which should be read 2. Reaching the end of the file here simply ends the test
    my $mate_line = <$test_fh>;
    last unless (defined $mate_line);

    my ($id_2) = split (/\t/,$mate_line);
    last unless (defined $id_2 and length $id_2); # a read ID of '0' is still a valid ID
    ++$count;

    ### ids are the same
    next if ($id_1 eq $id_2);

    ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
    my $id_1_trunc = $id_1;
    $id_1_trunc =~ s/\/1$//;
    my $id_2_trunc = $id_2;
    $id_2_trunc =~ s/\/2$//;

    unless ($id_1_trunc eq $id_2_trunc){
      die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
    }
  }

  ### The return value of close is deliberately not checked: we usually stop reading long before the end of
  ### the file, so a zcat/samtools child process gets terminated by a broken pipe and close reports a failure
  ### even though nothing went wrong (presumably the reason why 'close TEST or die' used to fail on some systems)
  close $test_fh;

  ### If it hasn't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
  warn "...passed!\n";
  sleep(1);

}
541 | 1053 |
542 | 1054 |
543 sub process_Bismark_results_file{ | 1055 sub process_Bismark_results_file{ |
544 my $filename = shift; | 1056 my $filename = shift; |
546 warn "\nNow reading in Bismark result file $filename\n\n"; | 1058 warn "\nNow reading in Bismark result file $filename\n\n"; |
547 | 1059 |
548 if ($filename =~ /\.gz$/) { | 1060 if ($filename =~ /\.gz$/) { |
549 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | 1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; |
550 } | 1062 } |
551 elsif ($filename =~ /bam$/) { | 1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { |
552 if ($samtools_path){ | 1064 if ($samtools_path){ |
553 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; | 1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; |
554 } | 1066 } |
555 else{ | 1067 else{ |
556 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | 1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; |
607 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | 1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; |
608 } else { | 1120 } else { |
609 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | 1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default |
610 } | 1122 } |
611 } | 1123 } |
612 | 1124 if ($single){ |
613 if ($ignore) { | 1125 if ($ignore) { |
614 print REPORT "Ignoring first $ignore bases\n"; | 1126 print REPORT "Ignoring first $ignore bp\n"; |
1127 } | |
1128 } | |
1129 else{ # paired-end | |
1130 if ($ignore) { | |
1131 print REPORT "Ignoring first $ignore bp of Read 1\n"; | |
1132 } | |
1133 if ($ignore_r2){ | |
1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; | |
1135 } | |
615 } | 1136 } |
616 | 1137 |
617 if ($full) { | 1138 if ($full) { |
618 print REPORT "Output specified: comprehensive\n"; | 1139 print REPORT "Output specified: comprehensive\n"; |
619 } else { | 1140 } else { |
646 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | 1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); |
647 $cpg_output = $output_dir . $cpg_output; | 1168 $cpg_output = $output_dir . $cpg_output; |
648 | 1169 |
649 if ($gzip){ | 1170 if ($gzip){ |
650 $cpg_output .= '.gz'; | 1171 $cpg_output .= '.gz'; |
651 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; | 1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); |
652 } | 1173 } |
653 else{ | 1174 else{ |
654 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | 1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); |
655 } | 1176 } |
656 | 1177 |
657 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | 1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); |
658 push @sorting_files,$cpg_output; | 1179 push @sorting_files,$cpg_output; |
659 | 1180 |
660 unless ($no_header) { | 1181 unless ($no_header) { |
661 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | 1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
662 } | 1183 } |
663 | 1184 |
664 ### C in any other context than CpG | 1185 ### C in any other context than CpG |
665 $other_c_output =~ s/^/Non_CpG_context_/; | 1186 $other_c_output =~ s/^/Non_CpG_context_/; |
666 $other_c_output =~ s/sam$/txt/; | 1187 $other_c_output =~ s/sam$/txt/; |
668 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | 1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); |
669 $other_c_output = $output_dir . $other_c_output; | 1190 $other_c_output = $output_dir . $other_c_output; |
670 | 1191 |
671 if ($gzip){ | 1192 if ($gzip){ |
672 $other_c_output .= '.gz'; | 1193 $other_c_output .= '.gz'; |
673 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n"; | 1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); |
674 } | 1195 } |
675 else{ | 1196 else{ |
676 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; | 1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); |
677 } | 1198 } |
678 | 1199 |
679 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n"; | 1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); |
680 push @sorting_files,$other_c_output; | 1201 push @sorting_files,$other_c_output; |
681 | 1202 |
682 | 1203 |
683 unless ($no_header) { | 1204 unless ($no_header) { |
684 print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; | 1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
685 } | 1206 } |
686 } | 1207 } |
687 | 1208 |
688 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | 1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found |
689 elsif ($merge_non_CpG) { | 1210 elsif ($merge_non_CpG) { |
697 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | 1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); |
698 $cpg_ot = $output_dir . $cpg_ot; | 1219 $cpg_ot = $output_dir . $cpg_ot; |
699 | 1220 |
700 if ($gzip){ | 1221 if ($gzip){ |
701 $cpg_ot .= '.gz'; | 1222 $cpg_ot .= '.gz'; |
702 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; | 1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); |
703 } | 1224 } |
704 else{ | 1225 else{ |
705 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | 1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); |
706 } | 1227 } |
707 | 1228 |
708 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | 1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); |
709 push @sorting_files,$cpg_ot; | 1230 push @sorting_files,$cpg_ot; |
710 | 1231 |
711 unless($no_header){ | 1232 unless($no_header){ |
712 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | 1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
713 } | 1234 } |
714 | 1235 |
715 $cpg_ctot =~ s/^/CpG_CTOT_/; | 1236 $cpg_ctot =~ s/^/CpG_CTOT_/; |
716 $cpg_ctot =~ s/sam$/txt/; | 1237 $cpg_ctot =~ s/sam$/txt/; |
717 $cpg_ctot =~ s/bam$/txt/; | 1238 $cpg_ctot =~ s/bam$/txt/; |
718 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | 1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); |
719 $cpg_ctot = $output_dir . $cpg_ctot; | 1240 $cpg_ctot = $output_dir . $cpg_ctot; |
720 | 1241 |
721 if ($gzip){ | 1242 if ($gzip){ |
722 $cpg_ctot .= '.gz'; | 1243 $cpg_ctot .= '.gz'; |
723 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; | 1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); |
724 } | 1245 } |
725 else{ | 1246 else{ |
726 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | 1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); |
727 } | 1248 } |
728 | 1249 |
729 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | 1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); |
730 push @sorting_files,$cpg_ctot; | 1251 push @sorting_files,$cpg_ctot; |
731 | 1252 |
732 unless($no_header){ | 1253 unless($no_header){ |
733 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | 1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
734 } | 1255 } |
735 | 1256 |
736 $cpg_ctob =~ s/^/CpG_CTOB_/; | 1257 $cpg_ctob =~ s/^/CpG_CTOB_/; |
737 $cpg_ctob =~ s/sam$/txt/; | 1258 $cpg_ctob =~ s/sam$/txt/; |
738 $cpg_ctob =~ s/bam$/txt/; | 1259 $cpg_ctob =~ s/bam$/txt/; |
739 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | 1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); |
740 $cpg_ctob = $output_dir . $cpg_ctob; | 1261 $cpg_ctob = $output_dir . $cpg_ctob; |
741 | 1262 |
742 if ($gzip){ | 1263 if ($gzip){ |
743 $cpg_ctob .= '.gz'; | 1264 $cpg_ctob .= '.gz'; |
744 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; | 1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); |
745 } | 1266 } |
746 else{ | 1267 else{ |
747 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | 1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); |
748 } | 1269 } |
749 | 1270 |
750 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | 1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); |
751 push @sorting_files,$cpg_ctob; | 1272 push @sorting_files,$cpg_ctob; |
752 | 1273 |
753 unless($no_header){ | 1274 unless($no_header){ |
754 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | 1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
755 } | 1276 } |
756 | 1277 |
757 $cpg_ob =~ s/^/CpG_OB_/; | 1278 $cpg_ob =~ s/^/CpG_OB_/; |
758 $cpg_ob =~ s/sam$/txt/; | 1279 $cpg_ob =~ s/sam$/txt/; |
759 $cpg_ob =~ s/bam$/txt/; | 1280 $cpg_ob =~ s/bam$/txt/; |
760 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | 1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); |
761 $cpg_ob = $output_dir . $cpg_ob; | 1282 $cpg_ob = $output_dir . $cpg_ob; |
762 | 1283 |
763 if ($gzip){ | 1284 if ($gzip){ |
764 $cpg_ob .= '.gz'; | 1285 $cpg_ob .= '.gz'; |
765 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; | 1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); |
766 } | 1287 } |
767 else{ | 1288 else{ |
768 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | 1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); |
769 } | 1290 } |
770 | 1291 |
771 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | 1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); |
772 push @sorting_files,$cpg_ob; | 1293 push @sorting_files,$cpg_ob; |
773 | 1294 |
774 unless($no_header){ | 1295 unless($no_header){ |
775 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | 1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
776 } | 1297 } |
777 | 1298 |
778 ### For cytosines in Non-CpG (CC, CT or CA) context | 1299 ### For cytosines in Non-CpG (CC, CT or CA) context |
779 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | 1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; |
780 | 1301 |
784 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | 1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); |
785 $other_c_ot = $output_dir . $other_c_ot; | 1306 $other_c_ot = $output_dir . $other_c_ot; |
786 | 1307 |
787 if ($gzip){ | 1308 if ($gzip){ |
788 $other_c_ot .= '.gz'; | 1309 $other_c_ot .= '.gz'; |
789 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n"; | 1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); |
790 } | 1311 } |
791 else{ | 1312 else{ |
792 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; | 1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); |
793 } | 1314 } |
794 | 1315 |
795 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; | 1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); |
796 push @sorting_files,$other_c_ot; | 1317 push @sorting_files,$other_c_ot; |
797 | 1318 |
798 unless($no_header){ | 1319 unless($no_header){ |
799 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; | 1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
800 } | 1321 } |
801 | 1322 |
802 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | 1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; |
803 $other_c_ctot =~ s/sam$/txt/; | 1324 $other_c_ctot =~ s/sam$/txt/; |
804 $other_c_ctot =~ s/bam$/txt/; | 1325 $other_c_ctot =~ s/bam$/txt/; |
805 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | 1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); |
806 $other_c_ctot = $output_dir . $other_c_ctot; | 1327 $other_c_ctot = $output_dir . $other_c_ctot; |
807 | 1328 |
808 if ($gzip){ | 1329 if ($gzip){ |
809 $other_c_ctot .= '.gz'; | 1330 $other_c_ctot .= '.gz'; |
810 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n"; | 1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); |
811 } | 1332 } |
812 else{ | 1333 else{ |
813 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; | 1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); |
814 } | 1335 } |
815 | 1336 |
816 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; | 1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); |
817 push @sorting_files,$other_c_ctot; | 1338 push @sorting_files,$other_c_ctot; |
818 | 1339 |
819 unless($no_header){ | 1340 unless($no_header){ |
820 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; | 1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
821 } | 1342 } |
822 | 1343 |
823 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | 1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; |
824 $other_c_ctob =~ s/sam$/txt/; | 1345 $other_c_ctob =~ s/sam$/txt/; |
825 $other_c_ctob =~ s/bam$/txt/; | 1346 $other_c_ctob =~ s/bam$/txt/; |
826 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | 1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); |
827 $other_c_ctob = $output_dir . $other_c_ctob; | 1348 $other_c_ctob = $output_dir . $other_c_ctob; |
828 | 1349 |
829 if ($gzip){ | 1350 if ($gzip){ |
830 $other_c_ctob .= '.gz'; | 1351 $other_c_ctob .= '.gz'; |
831 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n"; | 1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); |
832 } | 1353 } |
833 else{ | 1354 else{ |
834 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; | 1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); |
835 } | 1356 } |
836 | 1357 |
837 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; | 1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); |
838 push @sorting_files,$other_c_ctob; | 1359 push @sorting_files,$other_c_ctob; |
839 | 1360 |
840 unless($no_header){ | 1361 unless($no_header){ |
841 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; | 1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
842 } | 1363 } |
843 | 1364 |
844 $other_c_ob =~ s/^/Non_CpG_OB_/; | 1365 $other_c_ob =~ s/^/Non_CpG_OB_/; |
845 $other_c_ob =~ s/sam$/txt/; | 1366 $other_c_ob =~ s/sam$/txt/; |
846 $other_c_ob =~ s/sam$/txt/; | 1367 $other_c_ob =~ s/sam$/txt/; |
847 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | 1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); |
848 $other_c_ob = $output_dir . $other_c_ob; | 1369 $other_c_ob = $output_dir . $other_c_ob; |
849 | 1370 |
850 if ($gzip){ | 1371 if ($gzip){ |
851 $other_c_ob .= '.gz'; | 1372 $other_c_ob .= '.gz'; |
852 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n"; | 1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); |
853 } | 1374 } |
854 else{ | 1375 else{ |
855 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; | 1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); |
856 } | 1377 } |
857 | 1378 |
858 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; | 1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); |
859 push @sorting_files,$other_c_ob; | 1380 push @sorting_files,$other_c_ob; |
860 | 1381 |
861 unless($no_header){ | 1382 unless($no_header){ |
862 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; | 1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
863 } | 1384 } |
864 } | 1385 } |
865 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | 1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) |
866 | 1387 |
867 ### if --comprehensive was specified we are only writing one file per context | 1388 ### if --comprehensive was specified we are only writing one file per context |
874 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | 1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); |
875 $cpg_output = $output_dir . $cpg_output; | 1396 $cpg_output = $output_dir . $cpg_output; |
876 | 1397 |
877 if ($gzip){ | 1398 if ($gzip){ |
878 $cpg_output .= '.gz'; | 1399 $cpg_output .= '.gz'; |
879 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; | 1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); |
880 } | 1401 } |
881 else{ | 1402 else{ |
882 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; | 1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); |
883 } | 1404 } |
884 | 1405 |
885 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; | 1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); |
886 push @sorting_files,$cpg_output; | 1407 push @sorting_files,$cpg_output; |
887 | 1408 |
888 unless($no_header){ | 1409 unless($no_header){ |
889 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; | 1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
890 } | 1411 } |
891 | 1412 |
892 ### C in CHG context | 1413 ### C in CHG context |
893 $chg_output =~ s/^/CHG_context_/; | 1414 $chg_output =~ s/^/CHG_context_/; |
894 $chg_output =~ s/sam$/txt/; | 1415 $chg_output =~ s/sam$/txt/; |
896 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | 1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); |
897 $chg_output = $output_dir . $chg_output; | 1418 $chg_output = $output_dir . $chg_output; |
898 | 1419 |
899 if ($gzip){ | 1420 if ($gzip){ |
900 $chg_output .= '.gz'; | 1421 $chg_output .= '.gz'; |
901 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n"; | 1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); |
902 } | 1423 } |
903 else{ | 1424 else{ |
904 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; | 1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); |
905 } | 1426 } |
906 | 1427 |
907 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n"; | 1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); |
908 push @sorting_files,$chg_output; | 1429 push @sorting_files,$chg_output; |
909 | 1430 |
910 unless($no_header){ | 1431 unless($no_header){ |
911 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; | 1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
912 } | 1433 } |
913 | 1434 |
914 ### C in CHH context | 1435 ### C in CHH context |
915 $chh_output =~ s/^/CHH_context_/; | 1436 $chh_output =~ s/^/CHH_context_/; |
916 $chh_output =~ s/sam$/txt/; | 1437 $chh_output =~ s/sam$/txt/; |
918 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | 1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); |
919 $chh_output = $output_dir . $chh_output; | 1440 $chh_output = $output_dir . $chh_output; |
920 | 1441 |
921 if ($gzip){ | 1442 if ($gzip){ |
922 $chh_output .= '.gz'; | 1443 $chh_output .= '.gz'; |
923 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n"; | 1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); |
924 } | 1445 } |
925 else{ | 1446 else{ |
926 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; | 1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); |
927 } | 1448 } |
928 | 1449 |
929 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n"; | 1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); |
930 push @sorting_files, $chh_output; | 1451 push @sorting_files, $chh_output; |
931 | 1452 |
932 unless($no_header){ | 1453 unless($no_header){ |
933 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; | 1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
934 } | 1455 } |
935 } | 1456 } |
936 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | 1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found |
937 else { | 1458 else { |
938 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | 1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; |
944 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | 1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); |
945 $cpg_ot = $output_dir . $cpg_ot; | 1466 $cpg_ot = $output_dir . $cpg_ot; |
946 | 1467 |
947 if ($gzip){ | 1468 if ($gzip){ |
948 $cpg_ot .= '.gz'; | 1469 $cpg_ot .= '.gz'; |
949 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; | 1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); |
950 } | 1471 } |
951 else{ | 1472 else{ |
952 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; | 1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); |
953 } | 1474 } |
954 | 1475 |
955 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; | 1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); |
956 push @sorting_files,$cpg_ot; | 1477 push @sorting_files,$cpg_ot; |
957 | 1478 |
958 unless($no_header){ | 1479 unless($no_header){ |
959 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; | 1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
960 } | 1481 } |
961 | 1482 |
962 $cpg_ctot =~ s/^/CpG_CTOT_/; | 1483 $cpg_ctot =~ s/^/CpG_CTOT_/; |
963 $cpg_ctot =~ s/sam$/txt/; | 1484 $cpg_ctot =~ s/sam$/txt/; |
964 $cpg_ctot =~ s/bam$/txt/; | 1485 $cpg_ctot =~ s/bam$/txt/; |
965 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | 1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); |
966 $cpg_ctot = $output_dir . $cpg_ctot; | 1487 $cpg_ctot = $output_dir . $cpg_ctot; |
967 | 1488 |
968 if ($gzip){ | 1489 if ($gzip){ |
969 $cpg_ctot .= '.gz'; | 1490 $cpg_ctot .= '.gz'; |
970 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; | 1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); |
971 } | 1492 } |
972 else{ | 1493 else{ |
973 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; | 1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); |
974 } | 1495 } |
975 | 1496 |
976 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; | 1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); |
977 push @sorting_files,$cpg_ctot; | 1498 push @sorting_files,$cpg_ctot; |
978 | 1499 |
979 unless($no_header){ | 1500 unless($no_header){ |
980 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; | 1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
981 } | 1502 } |
982 | 1503 |
983 $cpg_ctob =~ s/^/CpG_CTOB_/; | 1504 $cpg_ctob =~ s/^/CpG_CTOB_/; |
984 $cpg_ctob =~ s/sam$/txt/; | 1505 $cpg_ctob =~ s/sam$/txt/; |
985 $cpg_ctob =~ s/bam$/txt/; | 1506 $cpg_ctob =~ s/bam$/txt/; |
986 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | 1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); |
987 $cpg_ctob = $output_dir . $cpg_ctob; | 1508 $cpg_ctob = $output_dir . $cpg_ctob; |
988 | 1509 |
989 if ($gzip){ | 1510 if ($gzip){ |
990 $cpg_ctob .= '.gz'; | 1511 $cpg_ctob .= '.gz'; |
991 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; | 1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); |
992 } | 1513 } |
993 else{ | 1514 else{ |
994 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; | 1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); |
995 } | 1516 } |
996 | 1517 |
997 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; | 1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); |
998 push @sorting_files,$cpg_ctob; | 1519 push @sorting_files,$cpg_ctob; |
999 | 1520 |
1000 unless($no_header){ | 1521 unless($no_header){ |
1001 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; | 1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1002 } | 1523 } |
1003 | 1524 |
1004 $cpg_ob =~ s/^/CpG_OB_/; | 1525 $cpg_ob =~ s/^/CpG_OB_/; |
1005 $cpg_ob =~ s/sam$/txt/; | 1526 $cpg_ob =~ s/sam$/txt/; |
1006 $cpg_ob =~ s/bam$/txt/; | 1527 $cpg_ob =~ s/bam$/txt/; |
1007 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | 1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); |
1008 $cpg_ob = $output_dir . $cpg_ob; | 1529 $cpg_ob = $output_dir . $cpg_ob; |
1009 | 1530 |
1010 if ($gzip){ | 1531 if ($gzip){ |
1011 $cpg_ob .= '.gz'; | 1532 $cpg_ob .= '.gz'; |
1012 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; | 1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); |
1013 } | 1534 } |
1014 else{ | 1535 else{ |
1015 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; | 1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); |
1016 } | 1537 } |
1017 | 1538 |
1018 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; | 1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); |
1019 push @sorting_files,$cpg_ob; | 1540 push @sorting_files,$cpg_ob; |
1020 | 1541 |
1021 unless($no_header){ | 1542 unless($no_header){ |
1022 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; | 1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1023 } | 1544 } |
1024 | 1545 |
1025 ### For cytosines in CHG context | 1546 ### For cytosines in CHG context |
1026 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | 1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; |
1027 | 1548 |
1031 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | 1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); |
1032 $chg_ot = $output_dir . $chg_ot; | 1553 $chg_ot = $output_dir . $chg_ot; |
1033 | 1554 |
1034 if ($gzip){ | 1555 if ($gzip){ |
1035 $chg_ot .= '.gz'; | 1556 $chg_ot .= '.gz'; |
1036 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n"; | 1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); |
1037 } | 1558 } |
1038 else{ | 1559 else{ |
1039 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; | 1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); |
1040 } | 1561 } |
1041 | 1562 |
1042 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; | 1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); |
1043 push @sorting_files,$chg_ot; | 1564 push @sorting_files,$chg_ot; |
1044 | 1565 |
1045 unless($no_header){ | 1566 unless($no_header){ |
1046 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; | 1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1047 } | 1568 } |
1048 | 1569 |
1049 $chg_ctot =~ s/^/CHG_CTOT_/; | 1570 $chg_ctot =~ s/^/CHG_CTOT_/; |
1050 $chg_ctot =~ s/sam$/txt/; | 1571 $chg_ctot =~ s/sam$/txt/; |
1051 $chg_ctot =~ s/bam$/txt/; | 1572 $chg_ctot =~ s/bam$/txt/; |
1052 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | 1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); |
1053 $chg_ctot = $output_dir . $chg_ctot; | 1574 $chg_ctot = $output_dir . $chg_ctot; |
1054 | 1575 |
1055 if ($gzip){ | 1576 if ($gzip){ |
1056 $chg_ctot .= '.gz'; | 1577 $chg_ctot .= '.gz'; |
1057 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n"; | 1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); |
1058 } | 1579 } |
1059 else{ | 1580 else{ |
1060 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; | 1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); |
1061 } | 1582 } |
1062 | 1583 |
1063 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; | 1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); |
1064 push @sorting_files,$chg_ctot; | 1585 push @sorting_files,$chg_ctot; |
1065 | 1586 |
1066 unless($no_header){ | 1587 unless($no_header){ |
1067 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; | 1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1068 } | 1589 } |
1069 | 1590 |
1070 $chg_ctob =~ s/^/CHG_CTOB_/; | 1591 $chg_ctob =~ s/^/CHG_CTOB_/; |
1071 $chg_ctob =~ s/sam$/txt/; | 1592 $chg_ctob =~ s/sam$/txt/; |
1072 $chg_ctob =~ s/bam$/txt/; | 1593 $chg_ctob =~ s/bam$/txt/; |
1073 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | 1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); |
1074 $chg_ctob = $output_dir . $chg_ctob; | 1595 $chg_ctob = $output_dir . $chg_ctob; |
1075 | 1596 |
1076 if ($gzip){ | 1597 if ($gzip){ |
1077 $chg_ctob .= '.gz'; | 1598 $chg_ctob .= '.gz'; |
1078 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n"; | 1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); |
1079 } | 1600 } |
1080 else{ | 1601 else{ |
1081 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; | 1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); |
1082 } | 1603 } |
1083 | 1604 |
1084 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; | 1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); |
1085 push @sorting_files,$chg_ctob; | 1606 push @sorting_files,$chg_ctob; |
1086 | 1607 |
1087 unless($no_header){ | 1608 unless($no_header){ |
1088 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; | 1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1089 } | 1610 } |
1090 | 1611 |
1091 $chg_ob =~ s/^/CHG_OB_/; | 1612 $chg_ob =~ s/^/CHG_OB_/; |
1092 $chg_ob =~ s/sam$/txt/; | 1613 $chg_ob =~ s/sam$/txt/; |
1093 $chg_ob =~ s/bam$/txt/; | 1614 $chg_ob =~ s/bam$/txt/; |
1094 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | 1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); |
1095 $chg_ob = $output_dir . $chg_ob; | 1616 $chg_ob = $output_dir . $chg_ob; |
1096 | 1617 |
1097 if ($gzip){ | 1618 if ($gzip){ |
1098 $chg_ob .= '.gz'; | 1619 $chg_ob .= '.gz'; |
1099 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n"; | 1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); |
1100 } | 1621 } |
1101 else{ | 1622 else{ |
1102 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; | 1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); |
1103 } | 1624 } |
1104 | 1625 |
1105 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; | 1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); |
1106 push @sorting_files,$chg_ob; | 1627 push @sorting_files,$chg_ob; |
1107 | 1628 |
1108 unless($no_header){ | 1629 unless($no_header){ |
1109 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; | 1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1110 } | 1631 } |
1111 | 1632 |
1112 ### For cytosines in CHH context | 1633 ### For cytosines in CHH context |
1113 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | 1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; |
1114 | 1635 |
1118 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | 1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); |
1119 $chh_ot = $output_dir . $chh_ot; | 1640 $chh_ot = $output_dir . $chh_ot; |
1120 | 1641 |
1121 if ($gzip){ | 1642 if ($gzip){ |
1122 $chh_ot .= '.gz'; | 1643 $chh_ot .= '.gz'; |
1123 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n"; | 1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); |
1124 } | 1645 } |
1125 else{ | 1646 else{ |
1126 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; | 1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); |
1127 } | 1648 } |
1128 | 1649 |
1129 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; | 1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); |
1130 push @sorting_files,$chh_ot; | 1651 push @sorting_files,$chh_ot; |
1131 | 1652 |
1132 unless($no_header){ | 1653 unless($no_header){ |
1133 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; | 1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1134 } | 1655 } |
1135 | 1656 |
1136 $chh_ctot =~ s/^/CHH_CTOT_/; | 1657 $chh_ctot =~ s/^/CHH_CTOT_/; |
1137 $chh_ctot =~ s/sam$/txt/; | 1658 $chh_ctot =~ s/sam$/txt/; |
1138 $chh_ctot =~ s/bam$/txt/; | 1659 $chh_ctot =~ s/bam$/txt/; |
1139 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | 1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); |
1140 $chh_ctot = $output_dir . $chh_ctot; | 1661 $chh_ctot = $output_dir . $chh_ctot; |
1141 | 1662 |
1142 if ($gzip){ | 1663 if ($gzip){ |
1143 $chh_ctot .= '.gz'; | 1664 $chh_ctot .= '.gz'; |
1144 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n"; | 1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); |
1145 } | 1666 } |
1146 else{ | 1667 else{ |
1147 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; | 1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); |
1148 } | 1669 } |
1149 | 1670 |
1150 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; | 1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); |
1151 push @sorting_files,$chh_ctot; | 1672 push @sorting_files,$chh_ctot; |
1152 | 1673 |
1153 unless($no_header){ | 1674 unless($no_header){ |
1154 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; | 1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1155 } | 1676 } |
1156 | 1677 |
1157 $chh_ctob =~ s/^/CHH_CTOB_/; | 1678 $chh_ctob =~ s/^/CHH_CTOB_/; |
1158 $chh_ctob =~ s/sam$/txt/; | 1679 $chh_ctob =~ s/sam$/txt/; |
1159 $chh_ctob =~ s/bam$/txt/; | 1680 $chh_ctob =~ s/bam$/txt/; |
1160 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | 1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); |
1161 $chh_ctob = $output_dir . $chh_ctob; | 1682 $chh_ctob = $output_dir . $chh_ctob; |
1162 | 1683 |
1163 if ($gzip){ | 1684 if ($gzip){ |
1164 $chh_ctob .= '.gz'; | 1685 $chh_ctob .= '.gz'; |
1165 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n"; | 1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); |
1166 } | 1687 } |
1167 else{ | 1688 else{ |
1168 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; | 1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); |
1169 } | 1690 } |
1170 | 1691 |
1171 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; | 1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); |
1172 push @sorting_files,$chh_ctob; | 1693 push @sorting_files,$chh_ctob; |
1173 | 1694 |
1174 unless($no_header){ | 1695 unless($no_header){ |
1175 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; | 1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1176 } | 1697 } |
1177 | 1698 |
1178 $chh_ob =~ s/^/CHH_OB_/; | 1699 $chh_ob =~ s/^/CHH_OB_/; |
1179 $chh_ob =~ s/sam$/txt/; | 1700 $chh_ob =~ s/sam$/txt/; |
1180 $chh_ob =~ s/bam$/txt/; | 1701 $chh_ob =~ s/bam$/txt/; |
1181 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | 1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); |
1182 $chh_ob = $output_dir . $chh_ob; | 1703 $chh_ob = $output_dir . $chh_ob; |
1183 | 1704 |
1184 if ($gzip){ | 1705 if ($gzip){ |
1185 $chh_ob .= '.gz'; | 1706 $chh_ob .= '.gz'; |
1186 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n"; | 1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); |
1187 } | 1708 } |
1188 else{ | 1709 else{ |
1189 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; | 1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); |
1190 } | 1711 } |
1191 | 1712 |
1192 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; | 1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); |
1193 push @sorting_files,$chh_ob; | 1714 push @sorting_files,$chh_ob; |
1194 | 1715 |
1195 unless($no_header){ | 1716 unless($no_header){ |
1196 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; | 1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); |
1197 } | 1718 } |
1198 } | 1719 } |
1199 | 1720 |
1200 my $methylation_call_strings_processed = 0; | 1721 my $methylation_call_strings_processed = 0; |
1201 my $line_count = 0; | 1722 my $line_count = 0; |
1325 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | 1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> |
1326 if ($ignore) { | 1847 if ($ignore) { |
1327 # print "\n\n$meth_call\n"; | 1848 # print "\n\n$meth_call\n"; |
1328 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | 1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); |
1329 # print "$meth_call\n"; | 1850 # print "$meth_call\n"; |
1851 | |
1330 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | 1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly |
1331 | 1853 |
1332 my @len = split (/\D+/,$cigar); # storing the length per operation | 1854 my @len = split (/\D+/,$cigar); # storing the length per operation |
1333 my @ops = split (/\d+/,$cigar); # storing the operation | 1855 my @ops = split (/\d+/,$cigar); # storing the operation |
1334 shift @ops; # remove the empty first element | 1856 shift @ops; # remove the empty first element |
1442 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | 1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; |
1443 } | 1965 } |
1444 | 1966 |
1445 if ($meth_call_1 and $meth_call_2) { | 1967 if ($meth_call_1 and $meth_call_2) { |
1446 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | 1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' |
1969 | |
1447 if ($ignore) { | 1970 if ($ignore) { |
1448 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | 1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); |
1449 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | 1972 |
1450 | |
1451 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | 1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified |
1452 $start_read_1 += $ignore; | 1974 $start_read_1 += $ignore; |
1453 $end_read_2 -= $ignore; | 1975 } |
1454 } | 1976 if ($ignore_r2) { |
1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
1978 | |
1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified | |
1980 $end_read_2 -= $ignore_r2; | |
1981 } | |
1982 | |
1455 my $end_read_1; | 1983 my $end_read_1; |
1456 my $start_read_2; | 1984 my $start_read_2; |
1457 | 1985 |
1458 if ($strand eq '+') { | 1986 if ($strand eq '+') { |
1459 | 1987 |
1460 $end_read_1 = $start_read_1+length($meth_call_1)-1; | 1988 $end_read_1 = $start_read_1+length($meth_call_1)-1; |
1461 $start_read_2 = $end_read_2-length($meth_call_2)+1; | 1989 $start_read_2 = $end_read_2-length($meth_call_2)+1; |
1462 | 1990 |
1463 ## we first pass the first read which is in + orientation on the forward strand | 1991 ## we first pass the first read which is in + orientation on the forward strand |
1464 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); | 1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity |
1465 | 1993 |
1466 # we next pass the second read which is in - orientation on the reverse strand | 1994 # we next pass the second read which is in - orientation on the reverse strand |
1467 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | 1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 |
1468 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); | 1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); |
1469 } else { | 1997 } |
1998 else { | |
1470 | 1999 |
1471 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | 2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! |
1472 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | 2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! |
1473 | 2002 |
1474 ## we first pass the first read which is in - orientation on the reverse strand | 2003 ## we first pass the first read which is in - orientation on the reverse strand |
1475 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); | 2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); |
1476 | 2005 |
1477 # we next pass the second read which is in + orientation on the forward strand | 2006 # we next pass the second read which is in + orientation on the forward strand |
1478 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | 2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 |
1479 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); | 2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); |
1480 } | 2009 } |
1481 | 2010 |
1482 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | 2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls |
1483 } | 2012 } |
1484 } | 2013 } |
1485 } else { # Bismark paired-end SAM output format (default) | 2014 } |
2015 else { # Bismark paired-end SAM output format (default) | |
1486 while (<IN>) { | 2016 while (<IN>) { |
1487 ### SAM format can either start with header lines (starting with @) or start with alignments directly | 2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly |
1488 if (/^\@/) { # skipping header lines (starting with @) | 2018 if (/^\@/) { # skipping header lines (starting with @) |
1489 warn "skipping SAM header line:\t$_"; | 2019 warn "skipping SAM header line:\t$_"; |
1490 next; | 2020 next; |
1583 | 2113 |
1584 ### READ 1 | 2114 ### READ 1 |
1585 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | 2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation |
1586 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | 2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation |
1587 shift @ops_1; # remove the empty first element | 2117 shift @ops_1; # remove the empty first element |
1588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); | 2118 |
2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); | |
1589 | 2120 |
1590 my @comp_cigar_1; # building an array with all CIGAR operations | 2121 my @comp_cigar_1; # building an array with all CIGAR operations |
1591 foreach my $index (0..$#len_1) { | 2122 foreach my $index (0..$#len_1) { |
1592 foreach (1..$len_1[$index]) { | 2123 foreach (1..$len_1[$index]) { |
1593 # print "$ops_1[$index]"; | 2124 # print "$ops_1[$index]"; |
1610 } | 2141 } |
1611 } | 2142 } |
1612 # print "original CIGAR read 2: $cigar_2\n"; | 2143 # print "original CIGAR read 2: $cigar_2\n"; |
1613 # print "original CIGAR read 2: @comp_cigar_2\n"; | 2144 # print "original CIGAR read 2: @comp_cigar_2\n"; |
1614 | 2145 |
2146 | |
2147 | |
1615 if ($ignore) { | 2148 if ($ignore) { |
1616 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | 2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1 |
1617 ### the methylation calls have already been reversed where necessary | 2150 ### the methylation calls have already been reversed where necessary |
1618 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | 2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); |
1619 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); | |
1620 | |
1621 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1622 | 2152 |
1623 if ($strand eq '+') { | 2153 if ($strand eq '+') { |
1624 | 2154 |
1625 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | 2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start |
1626 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | 2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions |
1627 my $I_count_1 = 0; | 2157 my $I_count_1 = 0; |
1628 | 2158 |
1629 for (1..$ignore) { | 2159 for (1..$ignore) { |
1630 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | 2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start |
1631 # print "$_ deleted $op\n"; | 2161 # print "$_ deleted $op\n"; |
1632 | 2162 |
1633 while ($op eq 'D') { # repeating this for deletions (D) | 2163 while ($op eq 'D') { # repeating this for deletions (D) |
1640 } | 2170 } |
1641 } | 2171 } |
1642 | 2172 |
1643 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | 2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1; |
1644 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | 2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; |
1645 | 2175 |
1646 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
1647 | |
1648 for (1..$ignore) { | |
1649 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1650 while ($op eq 'D') { # repeating this for deletions (D) | |
1651 $op = pop @comp_cigar_2; | |
1652 } | |
1653 } | |
1654 # the start position of reads mapping to the reverse strand is being adjusted further below | 2176 # the start position of reads mapping to the reverse strand is being adjusted further below |
1655 } elsif ($strand eq '-') { | 2177 } |
2178 elsif ($strand eq '-') { | |
1656 | 2179 |
1657 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | 2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back |
1658 for (1..$ignore) { | 2181 for (1..$ignore) { |
1659 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | 2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array |
1660 while ($op eq 'D') { # repeating this for deletions (D) | 2183 while ($op eq 'D') { # repeating this for deletions (D) |
1661 $op = pop @comp_cigar_1; | 2184 $op = pop @comp_cigar_1; |
1662 } | 2185 } |
1663 } | 2186 } |
1664 # the start position of reads mapping to the reverse strand is being adjusted further below | 2187 # the start position of reads mapping to the reverse strand is being adjusted further below |
1665 | 2188 |
2189 } | |
2190 } | |
2191 | |
2192 if ($ignore_r2) { | |
2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2 | |
2194 ### the methylation calls have already been reversed where necessary | |
2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
2196 | |
2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
2198 | |
2199 if ($strand eq '+') { | |
2200 | |
2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
2202 | |
2203 for (1..$ignore_r2) { | |
2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
2205 while ($op eq 'D') { # repeating this for deletions (D) | |
2206 $op = pop @comp_cigar_2; | |
2207 } | |
2208 } | |
2209 # the start position of reads mapping to the reverse strand is being adjusted further below | |
2210 } | |
2211 elsif ($strand eq '-') { | |
2212 | |
1666 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | 2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start |
1667 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | 2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions |
1668 my $I_count_2 = 0; | 2215 my $I_count_2 = 0; |
1669 | 2216 |
1670 for (1..$ignore) { | 2217 for (1..$ignore_r2) { |
1671 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | 2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start |
1672 # print "$_ deleted $op\n"; | 2219 # print "$_ deleted $op\n"; |
1673 | 2220 |
1674 while ($op eq 'D') { # repeating this for deletions (D) | 2221 while ($op eq 'D') { # repeating this for deletions (D) |
1675 $D_count_2++; | 2222 $D_count_2++; |
1679 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | 2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I) |
1680 $I_count_2++; | 2227 $I_count_2++; |
1681 } | 2228 } |
1682 } | 2229 } |
1683 | 2230 |
1684 $start_read_2 += $ignore + $D_count_2 - $I_count_2; | 2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; |
1685 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | 2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; |
1686 | 2233 } |
1687 } | 2234 } |
1688 | 2235 |
2236 if ($ignore){ | |
1689 ### reconstituting shortened CIGAR string 1 | 2237 ### reconstituting shortened CIGAR string 1 |
1690 my $new_cigar_1; | 2238 my $new_cigar_1; |
1691 my $count_1 = 0; | 2239 my $count_1 = 0; |
1692 my $last_op_1; | 2240 my $last_op_1; |
1693 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | 2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; |
1706 } | 2254 } |
1707 } | 2255 } |
1708 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | 2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count |
1709 $cigar_1 = $new_cigar_1; | 2257 $cigar_1 = $new_cigar_1; |
1710 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | 2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; |
2259 } | |
2260 | |
2261 if ($ignore_r2){ | |
1711 | 2262 |
1712 ### reconstituting shortened CIGAR string 2 | 2263 ### reconstituting shortened CIGAR string 2 |
1713 my $new_cigar_2; | 2264 my $new_cigar_2; |
1714 my $count_2 = 0; | 2265 my $count_2 = 0; |
1715 my $last_op_2; | 2266 my $last_op_2; |
1720 ++$count_2; | 2271 ++$count_2; |
1721 next; | 2272 next; |
1722 } | 2273 } |
1723 if ($last_op_2 eq $op) { | 2274 if ($last_op_2 eq $op) { |
1724 ++$count_2; | 2275 ++$count_2; |
1725 } else { | 2276 } |
2277 else { | |
1726 $new_cigar_2 .= "$count_2$last_op_2"; | 2278 $new_cigar_2 .= "$count_2$last_op_2"; |
1727 $last_op_2 = $op; | 2279 $last_op_2 = $op; |
1728 $count_2 = 1; | 2280 $count_2 = 1; |
1729 } | 2281 } |
1730 } | 2282 } |
1731 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | 2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count |
1732 $cigar_2 = $new_cigar_2; | 2284 $cigar_2 = $new_cigar_2; |
1733 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; | 2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; |
1734 | 2286 } |
1735 } | 2287 |
2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on | |
1736 | 2289 |
1737 if ($strand eq '+') { | 2290 if ($strand eq '+') { |
1738 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | 2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 |
1739 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | 2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too |
1740 # print "reverse: @comp_cigar_2\n"; | 2293 # print "reverse: @comp_cigar_2\n"; |
1749 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | 2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't |
1750 } | 2303 } |
1751 | 2304 |
1752 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | 2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1; |
1753 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | 2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand |
1754 } else { | 2307 } |
2308 else { | |
1755 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | 2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 |
1756 | 2310 |
1757 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | 2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too |
1758 # print "reverse: @comp_cigar_1\n"; | 2312 # print "reverse: @comp_cigar_1\n"; |
1759 | 2313 |
1762 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | 2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't |
1763 } | 2317 } |
1764 | 2318 |
1765 $end_read_1 = $start_read_1; | 2319 $end_read_1 = $start_read_1; |
1766 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | 2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand |
1767 | |
1768 } | 2321 } |
1769 | 2322 |
1770 if ($strand eq '+') { | 2323 if ($strand eq '+') { |
1771 ## we first pass the first read which is in + orientation on the forward strand | 2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity |
1772 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); | 2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); |
1773 | 2326 |
1774 # we next pass the second read which is in - orientation on the reverse strand | 2327 # we next pass the second read which is in - orientation on the reverse strand |
1775 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | 2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 |
1776 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); | 2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); |
1777 } else { | 2330 } else { |
1778 ## we first pass the first read which is in - orientation on the reverse strand | 2331 ## we first pass the first read which is in - orientation on the reverse strand |
1779 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); | 2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); |
1780 | 2333 |
1781 # we next pass the second read which is in + orientation on the forward strand | 2334 # we next pass the second read which is in + orientation on the forward strand |
1782 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | 2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 |
1783 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); | 2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); |
1784 } | 2337 } |
1785 | 2338 |
1786 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | 2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls |
1787 } | 2340 } |
1788 } | 2341 } |
1789 } | 2342 } |
1790 } else { | 2343 } else { |
1791 die "Single-end or paired-end reads not specified properly\n"; | 2344 die "Single-end or paired-end reads not specified properly\n"; |
1792 } | 2345 } |
1793 | 2346 |
1794 print "\n\nProcessed $line_count lines from $filename in total\n"; | 2347 warn "\n\nProcessed $line_count lines from $filename in total\n"; |
1795 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | 2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; |
1796 if ($report) { | 2349 if ($report) { |
2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n"; | |
1797 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | 2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; |
1798 } | 2352 } |
1799 print_splitting_report (); | 2353 print_splitting_report (); |
1800 } | 2354 } |
1801 | 2355 |
1932 } | 2486 } |
1933 } | 2487 } |
1934 | 2488 |
1935 | 2489 |
1936 | 2490 |
1937 | |
1938 | |
1939 sub print_individual_C_methylation_states_paired_end_files{ | 2491 sub print_individual_C_methylation_states_paired_end_files{ |
1940 | 2492 |
1941 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; | 2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; |
2494 | |
2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 | |
2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); | |
2497 | |
2498 my @methylation_calls = split(//,$meth_call); | |
2499 | |
2500 ################################################################# | |
2501 ### . for bases not involving cytosines ### | |
2502 ### X for methylated C in CHG context (was protected) ### | |
2503 ### x for not methylated C in CHG context (was converted) ### | |
2504 ### H for methylated C in CHH context (was protected) ### | |
2505 ### h for not methylated C in CHH context (was converted) ### | |
2506 ### Z for methylated C in CpG context (was protected) ### | |
2507 ### z for not methylated C in CpG context (was converted) ### | |
2508 ### U for methylated C in Unknown context (was protected) ### | |
2509 ### u for not methylated C in Unknown context (was converted) ### | |
2510 ################################################################# | |
2511 | |
2512 my $methyl_CHG_count = 0; | |
2513 my $methyl_CHH_count = 0; | |
2514 my $methyl_CpG_count = 0; | |
2515 my $unmethylated_CHG_count = 0; | |
2516 my $unmethylated_CHH_count = 0; | |
2517 my $unmethylated_CpG_count = 0; | |
2518 | |
2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2521 my @comp_cigar; | |
2522 | |
2523 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing | |
2524 if ($cigar =~ /^\d+M$/){ | |
2525 # this check speeds up the extraction process by up to 60%!!! | |
2526 } | |
2527 else{ # parsing CIGAR string | |
2528 my @len; | |
2529 my @ops; | |
2530 @len = split (/\D+/,$cigar); # storing the length per operation | |
2531 @ops = split (/\d+/,$cigar); # storing the operation | |
2532 shift @ops; # remove the empty first element | |
2533 | |
2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2535 | |
2536 foreach my $index (0..$#len){ | |
2537 foreach (1..$len[$index]){ | |
2538 # print "$ops[$index]"; | |
2539 push @comp_cigar, $ops[$index]; | |
2540 } | |
2541 } | |
2542 # warn "\nDetected CIGAR string: $cigar\n"; | |
2543 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2544 # warn "number of operations: ",scalar @ops,"\n"; | |
2545 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2546 # print @comp_cigar,"\n"; | |
2547 # print "$meth_call\n\n"; | |
2548 # sleep (1); | |
2549 } | |
2550 | |
2551 if ($strand eq '-') { | |
2552 | |
2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
2554 if (@comp_cigar){ | |
2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2556 } | |
2557 # print "reverse CIGAR string: @comp_cigar\n"; | |
2558 | |
2559 ### the start position of paired-end files has already been corrected, see above | |
2560 } | |
2561 | |
2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
2563 | |
2564 if ($merge_non_CpG) { | |
2565 if ($no_overlap) { # this has to be read 2... | |
2566 | |
2567 ### single-file CpG and non-CpG context output | |
2568 if ($full) { | |
2569 if ($strand eq '+') { | |
2570 for my $index (0..$#methylation_calls) { | |
2571 | |
2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2575 $cigar_offset += $cigar_mod; | |
2576 $pos_offset += $pos_mod; | |
2577 } | |
2578 | |
2579 ### Returning as soon as the methylation calls start overlapping | |
2580 if ($start+$index+$pos_offset >= $end_read_1) { | |
2581 return; | |
2582 } | |
2583 | |
2584 if ($methylation_calls[$index] eq 'X') { | |
2585 $counting{total_meCHG_count}++; | |
2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2587 if ($read_identity == 1){ | |
2588 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2589 } | |
2590 else{ | |
2591 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2592 } | |
2593 } | |
2594 elsif ($methylation_calls[$index] eq 'x') { | |
2595 $counting{total_unmethylated_CHG_count}++; | |
2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2597 if ($read_identity == 1){ | |
2598 $mbias_1{CHG}->{$index+1}->{un}++; | |
2599 } | |
2600 else{ | |
2601 $mbias_2{CHG}->{$index+1}->{un}++; | |
2602 } | |
2603 } | |
2604 elsif ($methylation_calls[$index] eq 'Z') { | |
2605 $counting{total_meCpG_count}++; | |
2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2607 if ($read_identity == 1){ | |
2608 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2609 } | |
2610 else{ | |
2611 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2612 } | |
2613 } | |
2614 elsif ($methylation_calls[$index] eq 'z') { | |
2615 $counting{total_unmethylated_CpG_count}++; | |
2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2617 if ($read_identity == 1){ | |
2618 $mbias_1{CpG}->{$index+1}->{un}++; | |
2619 } | |
2620 else{ | |
2621 $mbias_2{CpG}->{$index+1}->{un}++; | |
2622 } | |
2623 } | |
2624 elsif ($methylation_calls[$index] eq 'H') { | |
2625 $counting{total_meCHH_count}++; | |
2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2627 if ($read_identity == 1){ | |
2628 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2629 } | |
2630 else{ | |
2631 $mbias_2{CHH}->{$index+1}->{meth}++; | |
2632 } | |
2633 } | |
2634 elsif ($methylation_calls[$index] eq 'h') { | |
2635 $counting{total_unmethylated_CHH_count}++; | |
2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2637 if ($read_identity == 1){ | |
2638 $mbias_1{CHH}->{$index+1}->{un}++; | |
2639 } | |
2640 else{ | |
2641 $mbias_2{CHH}->{$index+1}->{un}++; | |
2642 } | |
2643 } | |
2644 elsif ($methylation_calls[$index] eq '.'){} | |
2645 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2646 else{ | |
2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
2648 } | |
2649 } | |
2650 } | |
2651 elsif ($strand eq '-') { | |
2652 for my $index (0..$#methylation_calls) { | |
2653 | |
2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2657 $cigar_offset += $cigar_mod; | |
2658 $pos_offset += $pos_mod; | |
2659 } | |
2660 | |
2661 ### Returning as soon as the methylation calls start overlapping | |
2662 if ($start-$index+$pos_offset <= $end_read_1) { | |
2663 return; | |
2664 } | |
2665 | |
2666 if ($methylation_calls[$index] eq 'X') { | |
2667 $counting{total_meCHG_count}++; | |
2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2669 if ($read_identity == 1){ | |
2670 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2671 } | |
2672 else{ | |
2673 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2674 } | |
2675 } | |
2676 elsif ($methylation_calls[$index] eq 'x') { | |
2677 $counting{total_unmethylated_CHG_count}++; | |
2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2679 if ($read_identity == 1){ | |
2680 $mbias_1{CHG}->{$index+1}->{un}++; | |
2681 } | |
2682 else{ | |
2683 $mbias_2{CHG}->{$index+1}->{un}++; | |
2684 } | |
2685 } | |
2686 elsif ($methylation_calls[$index] eq 'Z') { | |
2687 $counting{total_meCpG_count}++; | |
2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2689 if ($read_identity == 1){ | |
2690 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2691 } | |
2692 else{ | |
2693 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2694 } | |
2695 } | |
2696 elsif ($methylation_calls[$index] eq 'z') { | |
2697 $counting{total_unmethylated_CpG_count}++; | |
2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2699 if ($read_identity == 1){ | |
2700 $mbias_1{CpG}->{$index+1}->{un}++; | |
2701 } | |
2702 else{ | |
2703 $mbias_2{CpG}->{$index+1}->{un}++; | |
2704 } | |
2705 } | |
2706 elsif ($methylation_calls[$index] eq 'H') { | |
2707 $counting{total_meCHH_count}++; | |
2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2709 if ($read_identity == 1){ | |
2710 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2711 } | |
2712 else{ | |
2713 $mbias_2{CHH}->{$index+1}->{meth}++; | |
2714 } | |
2715 } | |
2716 elsif ($methylation_calls[$index] eq 'h') { | |
2717 $counting{total_unmethylated_CHH_count}++; | |
2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2719 if ($read_identity == 1){ | |
2720 $mbias_1{CHH}->{$index+1}->{un}++; | |
2721 } | |
2722 else{ | |
2723 $mbias_2{CHH}->{$index+1}->{un}++; | |
2724 } | |
2725 } | |
2726 elsif ($methylation_calls[$index] eq '.') {} | |
2727 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2728 else{ | |
2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
2730 } | |
2731 } | |
2732 } else { | |
2733 die "The read orientation was neither + nor -: '$strand'\n"; | |
2734 } | |
2735 } | |
2736 | |
2737 ### strand-specific methylation output | |
2738 else { | |
2739 if ($strand eq '+') { | |
2740 for my $index (0..$#methylation_calls) { | |
2741 | |
2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2745 $cigar_offset += $cigar_mod; | |
2746 $pos_offset += $pos_mod; | |
2747 } | |
2748 | |
2749 ### Returning as soon as the methylation calls start overlapping | |
2750 if ($start+$index+$pos_offset >= $end_read_1) { | |
2751 return; | |
2752 } | |
2753 | |
2754 if ($methylation_calls[$index] eq 'X') { | |
2755 $counting{total_meCHG_count}++; | |
2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2757 if ($read_identity == 1){ | |
2758 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2759 } | |
2760 else{ | |
2761 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2762 } | |
2763 } | |
2764 elsif ($methylation_calls[$index] eq 'x') { | |
2765 $counting{total_unmethylated_CHG_count}++; | |
2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2767 if ($read_identity == 1){ | |
2768 $mbias_1{CHG}->{$index+1}->{un}++; | |
2769 } | |
2770 else{ | |
2771 $mbias_2{CHG}->{$index+1}->{un}++; | |
2772 } | |
2773 } | |
2774 elsif ($methylation_calls[$index] eq 'Z') { | |
2775 $counting{total_meCpG_count}++; | |
2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2777 if ($read_identity == 1){ | |
2778 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2779 } | |
2780 else{ | |
2781 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2782 } | |
2783 } | |
2784 elsif ($methylation_calls[$index] eq 'z') { | |
2785 $counting{total_unmethylated_CpG_count}++; | |
2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2787 if ($read_identity == 1){ | |
2788 $mbias_1{CpG}->{$index+1}->{un}++; | |
2789 } | |
2790 else{ | |
2791 $mbias_2{CpG}->{$index+1}->{un}++; | |
2792 } | |
2793 } | |
2794 elsif ($methylation_calls[$index] eq 'H') { | |
2795 $counting{total_meCHH_count}++; | |
2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2797 if ($read_identity == 1){ | |
2798 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2799 } | |
2800 else{ | |
2801 $mbias_2{CHH}->{$index+1}->{meth}++; | |
2802 } | |
2803 } | |
2804 elsif ($methylation_calls[$index] eq 'h') { | |
2805 $counting{total_unmethylated_CHH_count}++; | |
2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2807 if ($read_identity == 1){ | |
2808 $mbias_1{CHH}->{$index+1}->{un}++; | |
2809 } | |
2810 else{ | |
2811 $mbias_2{CHH}->{$index+1}->{un}++; | |
2812 } | |
2813 } | |
2814 elsif ($methylation_calls[$index] eq '.') {} | |
2815 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2816 else{ | |
2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2818 } | |
2819 } | |
2820 } elsif ($strand eq '-') { | |
2821 for my $index (0..$#methylation_calls) { | |
2822 | |
2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2826 $cigar_offset += $cigar_mod; | |
2827 $pos_offset += $pos_mod; | |
2828 } | |
2829 | |
2830 ### Returning as soon as the methylation calls start overlapping | |
2831 if ($start-$index+$pos_offset <= $end_read_1) { | |
2832 return; | |
2833 } | |
2834 | |
2835 if ($methylation_calls[$index] eq 'X') { | |
2836 $counting{total_meCHG_count}++; | |
2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2838 if ($read_identity == 1){ | |
2839 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2840 } | |
2841 else{ | |
2842 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2843 } | |
2844 } | |
2845 elsif ($methylation_calls[$index] eq 'x') { | |
2846 $counting{total_unmethylated_CHG_count}++; | |
2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2848 if ($read_identity == 1){ | |
2849 $mbias_1{CHG}->{$index+1}->{un}++; | |
2850 } | |
2851 else{ | |
2852 $mbias_2{CHG}->{$index+1}->{un}++; | |
2853 } | |
2854 } | |
2855 elsif ($methylation_calls[$index] eq 'Z') { | |
2856 $counting{total_meCpG_count}++; | |
2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2858 if ($read_identity == 1){ | |
2859 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2860 } | |
2861 else{ | |
2862 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2863 } | |
2864 } | |
2865 elsif ($methylation_calls[$index] eq 'z') { | |
2866 $counting{total_unmethylated_CpG_count}++; | |
2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2868 if ($read_identity == 1){ | |
2869 $mbias_1{CpG}->{$index+1}->{un}++; | |
2870 } | |
2871 else{ | |
2872 $mbias_2{CpG}->{$index+1}->{un}++; | |
2873 } | |
2874 } | |
2875 elsif ($methylation_calls[$index] eq 'H') { | |
2876 $counting{total_meCHH_count}++; | |
2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2878 if ($read_identity == 1){ | |
2879 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2880 } | |
2881 else{ | |
2882 $mbias_2{CHH}->{$index+1}->{meth}++; | |
2883 } | |
2884 } | |
2885 elsif ($methylation_calls[$index] eq 'h') { | |
2886 $counting{total_unmethylated_CHH_count}++; | |
2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2888 if ($read_identity == 1){ | |
2889 $mbias_1{CHH}->{$index+1}->{un}++; | |
2890 } | |
2891 else{ | |
2892 $mbias_2{CHH}->{$index+1}->{un}++; | |
2893 } | |
2894 } | |
2895 elsif ($methylation_calls[$index] eq '.') {} | |
2896 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2897 else{ | |
2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2899 } | |
2900 } | |
2901 } else { | |
2902 die "The strand orientation was neither + nor -: '$strand'/n"; | |
2903 } | |
2904 } | |
2905 } | |
2906 | |
2907 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2908 ### Still within the 2-CONTEXT ONLY optional section | |
2909 else { | |
2910 ### single-file CpG and non-CpG context output | |
2911 if ($full) { | |
2912 if ($strand eq '+') { | |
2913 for my $index (0..$#methylation_calls) { | |
2914 | |
2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2918 $cigar_offset += $cigar_mod; | |
2919 $pos_offset += $pos_mod; | |
2920 } | |
2921 | |
2922 if ($methylation_calls[$index] eq 'X') { | |
2923 $counting{total_meCHG_count}++; | |
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2925 if ($read_identity == 1){ | |
2926 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2927 } | |
2928 else{ | |
2929 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2930 } | |
2931 } | |
2932 elsif ($methylation_calls[$index] eq 'x') { | |
2933 $counting{total_unmethylated_CHG_count}++; | |
2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2935 if ($read_identity == 1){ | |
2936 $mbias_1{CHG}->{$index+1}->{un}++; | |
2937 } | |
2938 else{ | |
2939 $mbias_2{CHG}->{$index+1}->{un}++; | |
2940 } | |
2941 } | |
2942 elsif ($methylation_calls[$index] eq 'Z') { | |
2943 $counting{total_meCpG_count}++; | |
2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2945 if ($read_identity == 1){ | |
2946 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2947 } | |
2948 else{ | |
2949 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2950 } | |
2951 } | |
2952 elsif ($methylation_calls[$index] eq 'z') { | |
2953 $counting{total_unmethylated_CpG_count}++; | |
2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2955 if ($read_identity == 1){ | |
2956 $mbias_1{CpG}->{$index+1}->{un}++; | |
2957 } | |
2958 else{ | |
2959 $mbias_2{CpG}->{$index+1}->{un}++; | |
2960 } | |
2961 } | |
2962 elsif ($methylation_calls[$index] eq 'H') { | |
2963 $counting{total_meCHH_count}++; | |
2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2965 if ($read_identity == 1){ | |
2966 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2967 } | |
2968 else{ | |
2969 $mbias_2{CHH}->{$index+1}->{meth}++; | |
2970 } | |
2971 } | |
2972 elsif ($methylation_calls[$index] eq 'h') { | |
2973 $counting{total_unmethylated_CHH_count}++; | |
2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
2975 if ($read_identity == 1){ | |
2976 $mbias_1{CHH}->{$index+1}->{un}++; | |
2977 } | |
2978 else{ | |
2979 $mbias_2{CHH}->{$index+1}->{un}++; | |
2980 } | |
2981 } | |
2982 elsif ($methylation_calls[$index] eq '.') {} | |
2983 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2984 else{ | |
2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
2986 } | |
2987 } | |
2988 } elsif ($strand eq '-') { | |
2989 for my $index (0..$#methylation_calls) { | |
2990 | |
2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2994 $cigar_offset += $cigar_mod; | |
2995 $pos_offset += $pos_mod; | |
2996 } | |
2997 | |
2998 if ($methylation_calls[$index] eq 'X') { | |
2999 $counting{total_meCHG_count}++; | |
3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3001 if ($read_identity == 1){ | |
3002 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3003 } | |
3004 else{ | |
3005 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3006 } | |
3007 } | |
3008 elsif ($methylation_calls[$index] eq 'x') { | |
3009 $counting{total_unmethylated_CHG_count}++; | |
3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3011 if ($read_identity == 1){ | |
3012 $mbias_1{CHG}->{$index+1}->{un}++; | |
3013 } | |
3014 else{ | |
3015 $mbias_2{CHG}->{$index+1}->{un}++; | |
3016 } | |
3017 } | |
3018 elsif ($methylation_calls[$index] eq 'Z') { | |
3019 $counting{total_meCpG_count}++; | |
3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3021 if ($read_identity == 1){ | |
3022 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3023 } | |
3024 else{ | |
3025 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3026 } | |
3027 } | |
3028 elsif ($methylation_calls[$index] eq 'z') { | |
3029 $counting{total_unmethylated_CpG_count}++; | |
3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3031 if ($read_identity == 1){ | |
3032 $mbias_1{CpG}->{$index+1}->{un}++; | |
3033 } | |
3034 else{ | |
3035 $mbias_2{CpG}->{$index+1}->{un}++; | |
3036 } | |
3037 } | |
3038 elsif ($methylation_calls[$index] eq 'H') { | |
3039 $counting{total_meCHH_count}++; | |
3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3041 if ($read_identity == 1){ | |
3042 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3043 } | |
3044 else{ | |
3045 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3046 } | |
3047 } | |
3048 elsif ($methylation_calls[$index] eq 'h') { | |
3049 $counting{total_unmethylated_CHH_count}++; | |
3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3051 if ($read_identity == 1){ | |
3052 $mbias_1{CHH}->{$index+1}->{un}++; | |
3053 } | |
3054 else{ | |
3055 $mbias_2{CHH}->{$index+1}->{un}++; | |
3056 } | |
3057 } | |
3058 elsif ($methylation_calls[$index] eq '.') {} | |
3059 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3060 else{ | |
3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
3062 } | |
3063 } | |
3064 } else { | |
3065 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3066 } | |
3067 } | |
3068 | |
3069 ### strand-specific methylation output | |
3070 ### still within the 2-CONTEXT optional section | |
3071 else { | |
3072 if ($strand eq '+') { | |
3073 for my $index (0..$#methylation_calls) { | |
3074 | |
3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3078 $cigar_offset += $cigar_mod; | |
3079 $pos_offset += $pos_mod; | |
3080 } | |
3081 | |
3082 if ($methylation_calls[$index] eq 'X') { | |
3083 $counting{total_meCHG_count}++; | |
3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3085 if ($read_identity == 1){ | |
3086 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3087 } | |
3088 else{ | |
3089 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3090 } | |
3091 } | |
3092 elsif ($methylation_calls[$index] eq 'x') { | |
3093 $counting{total_unmethylated_CHG_count}++; | |
3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3095 if ($read_identity == 1){ | |
3096 $mbias_1{CHG}->{$index+1}->{un}++; | |
3097 } | |
3098 else{ | |
3099 $mbias_2{CHG}->{$index+1}->{un}++; | |
3100 } | |
3101 } | |
3102 elsif ($methylation_calls[$index] eq 'Z') { | |
3103 $counting{total_meCpG_count}++; | |
3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3105 if ($read_identity == 1){ | |
3106 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3107 } | |
3108 else{ | |
3109 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3110 } | |
3111 } | |
3112 elsif ($methylation_calls[$index] eq 'z') { | |
3113 $counting{total_unmethylated_CpG_count}++; | |
3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3115 if ($read_identity == 1){ | |
3116 $mbias_1{CpG}->{$index+1}->{un}++; | |
3117 } | |
3118 else{ | |
3119 $mbias_2{CpG}->{$index+1}->{un}++; | |
3120 } | |
3121 } | |
3122 elsif ($methylation_calls[$index] eq 'H') { | |
3123 $counting{total_meCHH_count}++; | |
3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3125 if ($read_identity == 1){ | |
3126 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3127 } | |
3128 else{ | |
3129 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3130 } | |
3131 } | |
3132 elsif ($methylation_calls[$index] eq 'h') { | |
3133 $counting{total_unmethylated_CHH_count}++; | |
3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3135 if ($read_identity == 1){ | |
3136 $mbias_1{CHH}->{$index+1}->{un}++; | |
3137 } | |
3138 else{ | |
3139 $mbias_2{CHH}->{$index+1}->{un}++; | |
3140 } | |
3141 } | |
3142 elsif ($methylation_calls[$index] eq '.') {} | |
3143 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3144 else{ | |
3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3146 } | |
3147 } | |
3148 } elsif ($strand eq '-') { | |
3149 for my $index (0..$#methylation_calls) { | |
3150 | |
3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3154 $cigar_offset += $cigar_mod; | |
3155 $pos_offset += $pos_mod; | |
3156 } | |
3157 | |
3158 if ($methylation_calls[$index] eq 'X') { | |
3159 $counting{total_meCHG_count}++; | |
3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3161 if ($read_identity == 1){ | |
3162 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3163 } | |
3164 else{ | |
3165 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3166 } | |
3167 } | |
3168 elsif ($methylation_calls[$index] eq 'x') { | |
3169 $counting{total_unmethylated_CHG_count}++; | |
3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3171 if ($read_identity == 1){ | |
3172 $mbias_1{CHG}->{$index+1}->{un}++; | |
3173 } | |
3174 else{ | |
3175 $mbias_2{CHG}->{$index+1}->{un}++; | |
3176 } | |
3177 } | |
3178 elsif ($methylation_calls[$index] eq 'Z') { | |
3179 $counting{total_meCpG_count}++; | |
3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3181 if ($read_identity == 1){ | |
3182 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3183 } | |
3184 else{ | |
3185 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3186 } | |
3187 } | |
3188 elsif ($methylation_calls[$index] eq 'z') { | |
3189 $counting{total_unmethylated_CpG_count}++; | |
3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3191 if ($read_identity == 1){ | |
3192 $mbias_1{CpG}->{$index+1}->{un}++; | |
3193 } | |
3194 else{ | |
3195 $mbias_2{CpG}->{$index+1}->{un}++; | |
3196 } | |
3197 } | |
3198 elsif ($methylation_calls[$index] eq 'H') { | |
3199 $counting{total_meCHH_count}++; | |
3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3201 if ($read_identity == 1){ | |
3202 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3203 } | |
3204 else{ | |
3205 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3206 } | |
3207 } | |
3208 elsif ($methylation_calls[$index] eq 'h') { | |
3209 $counting{total_unmethylated_CHH_count}++; | |
3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3211 if ($read_identity == 1){ | |
3212 $mbias_1{CHH}->{$index+1}->{un}++; | |
3213 } | |
3214 else{ | |
3215 $mbias_2{CHH}->{$index+1}->{un}++; | |
3216 } | |
3217 } | |
3218 elsif ($methylation_calls[$index] eq '.') {} | |
3219 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3220 else{ | |
3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3222 } | |
3223 } | |
3224 } else { | |
3225 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3226 } | |
3227 } | |
3228 } | |
3229 } | |
3230 | |
3231 ############################################ | |
3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
3233 ############################################ | |
3234 | |
3235 elsif ($no_overlap) { | |
3236 ### single-file CpG, CHG and CHH context output | |
3237 if ($full) { | |
3238 if ($strand eq '+') { | |
3239 for my $index (0..$#methylation_calls) { | |
3240 | |
3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3244 $cigar_offset += $cigar_mod; | |
3245 $pos_offset += $pos_mod; | |
3246 } | |
3247 | |
3248 ### Returning as soon as the methylation calls start overlapping | |
3249 if ($start+$index+$pos_offset >= $end_read_1) { | |
3250 return; | |
3251 } | |
3252 | |
3253 if ($methylation_calls[$index] eq 'X') { | |
3254 $counting{total_meCHG_count}++; | |
3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3256 if ($read_identity == 1){ | |
3257 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3258 } | |
3259 else{ | |
3260 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3261 } | |
3262 } | |
3263 elsif ($methylation_calls[$index] eq 'x') { | |
3264 $counting{total_unmethylated_CHG_count}++; | |
3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3266 if ($read_identity == 1){ | |
3267 $mbias_1{CHG}->{$index+1}->{un}++; | |
3268 } | |
3269 else{ | |
3270 $mbias_2{CHG}->{$index+1}->{un}++; | |
3271 } | |
3272 } | |
3273 elsif ($methylation_calls[$index] eq 'Z') { | |
3274 $counting{total_meCpG_count}++; | |
3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3276 if ($read_identity == 1){ | |
3277 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3278 } | |
3279 else{ | |
3280 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3281 } | |
3282 } | |
3283 elsif ($methylation_calls[$index] eq 'z') { | |
3284 $counting{total_unmethylated_CpG_count}++; | |
3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3286 if ($read_identity == 1){ | |
3287 $mbias_1{CpG}->{$index+1}->{un}++; | |
3288 } | |
3289 else{ | |
3290 $mbias_2{CpG}->{$index+1}->{un}++; | |
3291 } | |
3292 } | |
3293 elsif ($methylation_calls[$index] eq 'H') { | |
3294 $counting{total_meCHH_count}++; | |
3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3296 if ($read_identity == 1){ | |
3297 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3298 } | |
3299 else{ | |
3300 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3301 } | |
3302 } | |
3303 elsif ($methylation_calls[$index] eq 'h') { | |
3304 $counting{total_unmethylated_CHH_count}++; | |
3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3306 if ($read_identity == 1){ | |
3307 $mbias_1{CHH}->{$index+1}->{un}++; | |
3308 } | |
3309 else{ | |
3310 $mbias_2{CHH}->{$index+1}->{un}++; | |
3311 } | |
3312 } | |
3313 elsif ($methylation_calls[$index] eq '.') {} | |
3314 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3315 else{ | |
3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3317 } | |
3318 } | |
3319 } elsif ($strand eq '-') { | |
3320 for my $index (0..$#methylation_calls) { | |
3321 | |
3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3325 $cigar_offset += $cigar_mod; | |
3326 $pos_offset += $pos_mod; | |
3327 } | |
3328 | |
3329 ### Returning as soon as the methylation calls start overlapping | |
3330 if ($start-$index+$pos_offset <= $end_read_1) { | |
3331 return; | |
3332 } | |
3333 | |
3334 if ($methylation_calls[$index] eq 'X') { | |
3335 $counting{total_meCHG_count}++; | |
3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3337 if ($read_identity == 1){ | |
3338 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3339 } | |
3340 else{ | |
3341 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3342 } | |
3343 } | |
3344 elsif ($methylation_calls[$index] eq 'x') { | |
3345 $counting{total_unmethylated_CHG_count}++; | |
3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3347 if ($read_identity == 1){ | |
3348 $mbias_1{CHG}->{$index+1}->{un}++; | |
3349 } | |
3350 else{ | |
3351 $mbias_2{CHG}->{$index+1}->{un}++; | |
3352 } | |
3353 } | |
3354 elsif ($methylation_calls[$index] eq 'Z') { | |
3355 $counting{total_meCpG_count}++; | |
3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3357 if ($read_identity == 1){ | |
3358 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3359 } | |
3360 else{ | |
3361 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3362 } | |
3363 } | |
3364 elsif ($methylation_calls[$index] eq 'z') { | |
3365 $counting{total_unmethylated_CpG_count}++; | |
3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3367 if ($read_identity == 1){ | |
3368 $mbias_1{CpG}->{$index+1}->{un}++; | |
3369 } | |
3370 else{ | |
3371 $mbias_2{CpG}->{$index+1}->{un}++; | |
3372 } | |
3373 } | |
3374 elsif ($methylation_calls[$index] eq 'H') { | |
3375 $counting{total_meCHH_count}++; | |
3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3377 if ($read_identity == 1){ | |
3378 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3379 } | |
3380 else{ | |
3381 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3382 } | |
3383 } | |
3384 elsif ($methylation_calls[$index] eq 'h') { | |
3385 $counting{total_unmethylated_CHH_count}++; | |
3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3387 if ($read_identity == 1){ | |
3388 $mbias_1{CHH}->{$index+1}->{un}++; | |
3389 } | |
3390 else{ | |
3391 $mbias_2{CHH}->{$index+1}->{un}++; | |
3392 } | |
3393 } | |
3394 elsif ($methylation_calls[$index] eq '.') {} | |
3395 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3396 else{ | |
3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3398 } | |
3399 } | |
3400 } else { | |
3401 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3402 } | |
3403 } | |
3404 | |
3405 ### strand-specific methylation output | |
3406 else { | |
3407 if ($strand eq '+') { | |
3408 for my $index (0..$#methylation_calls) { | |
3409 | |
3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3413 $cigar_offset += $cigar_mod; | |
3414 $pos_offset += $pos_mod; | |
3415 } | |
3416 | |
3417 ### Returning as soon as the methylation calls start overlapping | |
3418 if ($start+$index+$pos_offset >= $end_read_1) { | |
3419 return; | |
3420 } | |
3421 | |
3422 if ($methylation_calls[$index] eq 'X') { | |
3423 $counting{total_meCHG_count}++; | |
3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3425 if ($read_identity == 1){ | |
3426 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3427 } | |
3428 else{ | |
3429 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3430 } | |
3431 } | |
3432 elsif ($methylation_calls[$index] eq 'x') { | |
3433 $counting{total_unmethylated_CHG_count}++; | |
3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3435 if ($read_identity == 1){ | |
3436 $mbias_1{CHG}->{$index+1}->{un}++; | |
3437 } | |
3438 else{ | |
3439 $mbias_2{CHG}->{$index+1}->{un}++; | |
3440 } | |
3441 } | |
3442 elsif ($methylation_calls[$index] eq 'Z') { | |
3443 $counting{total_meCpG_count}++; | |
3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3445 if ($read_identity == 1){ | |
3446 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3447 } | |
3448 else{ | |
3449 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3450 } | |
3451 } | |
3452 elsif ($methylation_calls[$index] eq 'z') { | |
3453 $counting{total_unmethylated_CpG_count}++; | |
3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3455 if ($read_identity == 1){ | |
3456 $mbias_1{CpG}->{$index+1}->{un}++; | |
3457 } | |
3458 else{ | |
3459 $mbias_2{CpG}->{$index+1}->{un}++; | |
3460 } | |
3461 } | |
3462 elsif ($methylation_calls[$index] eq 'H') { | |
3463 $counting{total_meCHH_count}++; | |
3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3465 if ($read_identity == 1){ | |
3466 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3467 } | |
3468 else{ | |
3469 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3470 } | |
3471 } | |
3472 elsif ($methylation_calls[$index] eq 'h') { | |
3473 $counting{total_unmethylated_CHH_count}++; | |
3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3475 if ($read_identity == 1){ | |
3476 $mbias_1{CHH}->{$index+1}->{un}++; | |
3477 } | |
3478 else{ | |
3479 $mbias_2{CHH}->{$index+1}->{un}++; | |
3480 } | |
3481 } | |
3482 elsif ($methylation_calls[$index] eq '.') {} | |
3483 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3484 else{ | |
3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3486 } | |
3487 } | |
3488 } elsif ($strand eq '-') { | |
3489 for my $index (0..$#methylation_calls) { | |
3490 | |
3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3494 $cigar_offset += $cigar_mod; | |
3495 $pos_offset += $pos_mod; | |
3496 } | |
3497 | |
3498 ### Returning as soon as the methylation calls start overlapping | |
3499 if ($start-$index+$pos_offset <= $end_read_1) { | |
3500 return; | |
3501 } | |
3502 | |
3503 if ($methylation_calls[$index] eq 'X') { | |
3504 $counting{total_meCHG_count}++; | |
3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3506 if ($read_identity == 1){ | |
3507 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3508 } | |
3509 else{ | |
3510 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3511 } | |
3512 } | |
3513 elsif ($methylation_calls[$index] eq 'x') { | |
3514 $counting{total_unmethylated_CHG_count}++; | |
3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3516 if ($read_identity == 1){ | |
3517 $mbias_1{CHG}->{$index+1}->{un}++; | |
3518 } | |
3519 else{ | |
3520 $mbias_2{CHG}->{$index+1}->{un}++; | |
3521 } | |
3522 } | |
3523 elsif ($methylation_calls[$index] eq 'Z') { | |
3524 $counting{total_meCpG_count}++; | |
3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3526 if ($read_identity == 1){ | |
3527 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3528 } | |
3529 else{ | |
3530 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3531 } | |
3532 } | |
3533 elsif ($methylation_calls[$index] eq 'z') { | |
3534 $counting{total_unmethylated_CpG_count}++; | |
3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3536 if ($read_identity == 1){ | |
3537 $mbias_1{CpG}->{$index+1}->{un}++; | |
3538 } | |
3539 else{ | |
3540 $mbias_2{CpG}->{$index+1}->{un}++; | |
3541 } | |
3542 } | |
3543 elsif ($methylation_calls[$index] eq 'H') { | |
3544 $counting{total_meCHH_count}++; | |
3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3546 if ($read_identity == 1){ | |
3547 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3548 } | |
3549 else{ | |
3550 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3551 } | |
3552 } | |
3553 elsif ($methylation_calls[$index] eq 'h') { | |
3554 $counting{total_unmethylated_CHH_count}++; | |
3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3556 if ($read_identity == 1){ | |
3557 $mbias_1{CHH}->{$index+1}->{un}++; | |
3558 } | |
3559 else{ | |
3560 $mbias_2{CHH}->{$index+1}->{un}++; | |
3561 } | |
3562 } | |
3563 elsif ($methylation_calls[$index] eq '.') {} | |
3564 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3565 else{ | |
3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3567 } | |
3568 } | |
3569 } else { | |
3570 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3571 } | |
3572 } | |
3573 } | |
3574 | |
3575 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
3576 else { | |
3577 ### single-file CpG, CHG and CHH context output | |
3578 if ($full) { | |
3579 if ($strand eq '+') { | |
3580 for my $index (0..$#methylation_calls) { | |
3581 | |
3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3585 $cigar_offset += $cigar_mod; | |
3586 $pos_offset += $pos_mod; | |
3587 } | |
3588 | |
3589 if ($methylation_calls[$index] eq 'X') { | |
3590 $counting{total_meCHG_count}++; | |
3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3592 if ($read_identity == 1){ | |
3593 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3594 } | |
3595 else{ | |
3596 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3597 } | |
3598 } | |
3599 elsif ($methylation_calls[$index] eq 'x') { | |
3600 $counting{total_unmethylated_CHG_count}++; | |
3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3602 if ($read_identity == 1){ | |
3603 $mbias_1{CHG}->{$index+1}->{un}++; | |
3604 } | |
3605 else{ | |
3606 $mbias_2{CHG}->{$index+1}->{un}++; | |
3607 } | |
3608 } | |
3609 elsif ($methylation_calls[$index] eq 'Z') { | |
3610 $counting{total_meCpG_count}++; | |
3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3612 if ($read_identity == 1){ | |
3613 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3614 } | |
3615 else{ | |
3616 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3617 } | |
3618 } | |
3619 elsif ($methylation_calls[$index] eq 'z') { | |
3620 $counting{total_unmethylated_CpG_count}++; | |
3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3622 if ($read_identity == 1){ | |
3623 $mbias_1{CpG}->{$index+1}->{un}++; | |
3624 } | |
3625 else{ | |
3626 $mbias_2{CpG}->{$index+1}->{un}++; | |
3627 } | |
3628 } | |
3629 elsif ($methylation_calls[$index] eq 'H') { | |
3630 $counting{total_meCHH_count}++; | |
3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3632 if ($read_identity == 1){ | |
3633 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3634 } | |
3635 else{ | |
3636 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3637 } | |
3638 } | |
3639 elsif ($methylation_calls[$index] eq 'h') { | |
3640 $counting{total_unmethylated_CHH_count}++; | |
3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3642 if ($read_identity == 1){ | |
3643 $mbias_1{CHH}->{$index+1}->{un}++; | |
3644 } | |
3645 else{ | |
3646 $mbias_2{CHH}->{$index+1}->{un}++; | |
3647 } | |
3648 } | |
3649 elsif ($methylation_calls[$index] eq '.') {} | |
3650 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3651 else{ | |
3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3653 } | |
3654 } | |
3655 } elsif ($strand eq '-') { | |
3656 for my $index (0..$#methylation_calls) { | |
3657 | |
3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3661 $cigar_offset += $cigar_mod; | |
3662 $pos_offset += $pos_mod; | |
3663 } | |
3664 | |
3665 if ($methylation_calls[$index] eq 'X') { | |
3666 $counting{total_meCHG_count}++; | |
3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3668 if ($read_identity == 1){ | |
3669 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3670 } | |
3671 else{ | |
3672 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3673 } | |
3674 } | |
3675 elsif ($methylation_calls[$index] eq 'x') { | |
3676 $counting{total_unmethylated_CHG_count}++; | |
3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3678 if ($read_identity == 1){ | |
3679 $mbias_1{CHG}->{$index+1}->{un}++; | |
3680 } | |
3681 else{ | |
3682 $mbias_2{CHG}->{$index+1}->{un}++; | |
3683 } | |
3684 } | |
3685 elsif ($methylation_calls[$index] eq 'Z') { | |
3686 $counting{total_meCpG_count}++; | |
3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3688 if ($read_identity == 1){ | |
3689 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3690 } | |
3691 else{ | |
3692 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3693 } | |
3694 } | |
3695 elsif ($methylation_calls[$index] eq 'z') { | |
3696 $counting{total_unmethylated_CpG_count}++; | |
3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3698 if ($read_identity == 1){ | |
3699 $mbias_1{CpG}->{$index+1}->{un}++; | |
3700 } | |
3701 else{ | |
3702 $mbias_2{CpG}->{$index+1}->{un}++; | |
3703 } | |
3704 } | |
3705 elsif ($methylation_calls[$index] eq 'H') { | |
3706 $counting{total_meCHH_count}++; | |
3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3708 if ($read_identity == 1){ | |
3709 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3710 } | |
3711 else{ | |
3712 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3713 } | |
3714 } | |
3715 elsif ($methylation_calls[$index] eq 'h') { | |
3716 $counting{total_unmethylated_CHH_count}++; | |
3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3718 if ($read_identity == 1){ | |
3719 $mbias_1{CHH}->{$index+1}->{un}++; | |
3720 } | |
3721 else{ | |
3722 $mbias_2{CHH}->{$index+1}->{un}++; | |
3723 } | |
3724 } | |
3725 elsif ($methylation_calls[$index] eq '.') {} | |
3726 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3727 else{ | |
3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3729 } | |
3730 } | |
3731 } else { | |
3732 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3733 } | |
3734 } | |
3735 | |
3736 ### strand-specific methylation output | |
3737 else { | |
3738 if ($strand eq '+') { | |
3739 for my $index (0..$#methylation_calls) { | |
3740 | |
3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3744 $cigar_offset += $cigar_mod; | |
3745 $pos_offset += $pos_mod; | |
3746 } | |
3747 | |
3748 if ($methylation_calls[$index] eq 'X') { | |
3749 $counting{total_meCHG_count}++; | |
3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3751 if ($read_identity == 1){ | |
3752 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3753 } | |
3754 else{ | |
3755 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3756 } | |
3757 } | |
3758 elsif ($methylation_calls[$index] eq 'x') { | |
3759 $counting{total_unmethylated_CHG_count}++; | |
3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3761 if ($read_identity == 1){ | |
3762 $mbias_1{CHG}->{$index+1}->{un}++; | |
3763 } | |
3764 else{ | |
3765 $mbias_2{CHG}->{$index+1}->{un}++; | |
3766 } | |
3767 } | |
3768 elsif ($methylation_calls[$index] eq 'Z') { | |
3769 $counting{total_meCpG_count}++; | |
3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3771 if ($read_identity == 1){ | |
3772 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3773 } | |
3774 else{ | |
3775 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3776 } | |
3777 } | |
3778 elsif ($methylation_calls[$index] eq 'z') { | |
3779 $counting{total_unmethylated_CpG_count}++; | |
3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3781 if ($read_identity == 1){ | |
3782 $mbias_1{CpG}->{$index+1}->{un}++; | |
3783 } | |
3784 else{ | |
3785 $mbias_2{CpG}->{$index+1}->{un}++; | |
3786 } | |
3787 } | |
3788 elsif ($methylation_calls[$index] eq 'H') { | |
3789 $counting{total_meCHH_count}++; | |
3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3791 if ($read_identity == 1){ | |
3792 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3793 } | |
3794 else{ | |
3795 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3796 } | |
3797 } | |
3798 elsif ($methylation_calls[$index] eq 'h') { | |
3799 $counting{total_unmethylated_CHH_count}++; | |
3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3801 if ($read_identity == 1){ | |
3802 $mbias_1{CHH}->{$index+1}->{un}++; | |
3803 } | |
3804 else{ | |
3805 $mbias_2{CHH}->{$index+1}->{un}++; | |
3806 } | |
3807 } | |
3808 elsif ($methylation_calls[$index] eq '.') {} | |
3809 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3810 else{ | |
3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3812 } | |
3813 } | |
3814 } elsif ($strand eq '-') { | |
3815 for my $index (0..$#methylation_calls) { | |
3816 | |
3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3820 $cigar_offset += $cigar_mod; | |
3821 $pos_offset += $pos_mod; | |
3822 } | |
3823 | |
3824 if ($methylation_calls[$index] eq 'X') { | |
3825 $counting{total_meCHG_count}++; | |
3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3827 if ($read_identity == 1){ | |
3828 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3829 } | |
3830 else{ | |
3831 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3832 } | |
3833 } | |
3834 elsif ($methylation_calls[$index] eq 'x') { | |
3835 $counting{total_unmethylated_CHG_count}++; | |
3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3837 if ($read_identity == 1){ | |
3838 $mbias_1{CHG}->{$index+1}->{un}++; | |
3839 } | |
3840 else{ | |
3841 $mbias_2{CHG}->{$index+1}->{un}++; | |
3842 } | |
3843 } | |
3844 elsif ($methylation_calls[$index] eq 'Z') { | |
3845 $counting{total_meCpG_count}++; | |
3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3847 if ($read_identity == 1){ | |
3848 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3849 } | |
3850 else{ | |
3851 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3852 } | |
3853 } | |
3854 elsif ($methylation_calls[$index] eq 'z') { | |
3855 $counting{total_unmethylated_CpG_count}++; | |
3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3857 if ($read_identity == 1){ | |
3858 $mbias_1{CpG}->{$index+1}->{un}++; | |
3859 } | |
3860 else{ | |
3861 $mbias_2{CpG}->{$index+1}->{un}++; | |
3862 } | |
3863 } | |
3864 elsif ($methylation_calls[$index] eq 'H') { | |
3865 $counting{total_meCHH_count}++; | |
3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3867 if ($read_identity == 1){ | |
3868 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3869 } | |
3870 else{ | |
3871 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3872 } | |
3873 } | |
3874 elsif ($methylation_calls[$index] eq 'h') { | |
3875 $counting{total_unmethylated_CHH_count}++; | |
3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
3877 if ($read_identity == 1){ | |
3878 $mbias_1{CHH}->{$index+1}->{un}++; | |
3879 } | |
3880 else{ | |
3881 $mbias_2{CHH}->{$index+1}->{un}++; | |
3882 } | |
3883 } | |
3884 elsif ($methylation_calls[$index] eq '.') {} | |
3885 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3886 else{ | |
3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3888 } | |
3889 } | |
3890 } else { | |
3891 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3892 } | |
3893 } | |
3894 } | |
3895 } | |
3896 | |
## check_cigar_string
##
## Works out by how much the CIGAR lookup offset and the genomic position
## offset have to change for the methylation call at read position $index,
## so that reads containing insertions or deletions are reported at the
## correct genomic coordinate.
##
## Arguments:
##   $index        - 0-based index into the methylation call string
##   $cigar_offset - offset accumulated so far between $index and the
##                   matching element of the composite CIGAR array
##   $pos_offset   - position offset accumulated so far (accepted for
##                   interface compatibility; not used in the calculation)
##   $strand       - '+' or '-'
##   $comp_cigar   - array reference holding one CIGAR operation ('M', 'I'
##                   or 'D') per element
##
## Returns the list ($new_cigar_offset, $new_pos_offset). Both values are
## increments, which the callers add to their running offsets. Any strand
## other than '+' or '-' returns (0,0) unchanged.
##
## Dies if an operation other than 'M', 'I' or 'D' is encountered; this
## includes the case where the lookup runs past the end of the CIGAR array.
sub check_cigar_string {
    my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
    my ($new_cigar_offset,$new_pos_offset) = (0,0);

    # Callers compute forward strand positions as $start + $index + offset and
    # reverse strand positions as $start - $index + offset, so every position
    # adjustment made for '+' is applied with the opposite sign for '-'.
    my $direction;
    if ($strand eq '+') {
        $direction = 1;
    }
    elsif ($strand eq '-') {
        $direction = -1;
    }
    else {
        return ($new_cigar_offset,$new_pos_offset);
    }

    my $cigar_pos = $index + $cigar_offset;

    # A lookup beyond the end of the array yields undef; it is normalised to
    # an empty string so that it is reported as an unknown operation below
    # without triggering 'uninitialized value' warnings first.
    my $op = $comp_cigar->[$cigar_pos];
    $op = '' unless defined $op;

    if ($op eq 'M') {
        # sequence position matches the genomic position, nothing to adjust
    }
    elsif ($op eq 'I') {
        # insertion in the read: the inserted base has no genomic counterpart,
        # so the genomic position is moved back by one in read direction
        $new_pos_offset -= $direction;
    }
    elsif ($op eq 'D') {
        # deletion in the read: the composite CIGAR array no longer lines up
        # with the methylation call index, and the genomic position has to
        # skip over the deleted base
        $new_cigar_offset += 1;
        $new_pos_offset   += $direction;

        # keep consuming operations until the first one that belongs to a
        # base of the read again ('M' or 'I'), or until the CIGAR is exhausted
        while ( ($cigar_pos + $new_cigar_offset) < (scalar @$comp_cigar) ) {
            my $next_op = $comp_cigar->[$cigar_pos + $new_cigar_offset];
            $next_op = '' unless defined $next_op;

            if ($next_op eq 'M') {
                last;
            }
            elsif ($next_op eq 'I') {
                $new_pos_offset -= $direction;
                last;
            }
            elsif ($next_op eq 'D') {
                $new_cigar_offset += 1;
                $new_pos_offset   += $direction;
            }
            else {
                die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$next_op'\n";
            }
        }
    }
    else {
        die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$op'\n";
    }

    return ($new_cigar_offset,$new_pos_offset);
}
3988 | |
3989 sub print_individual_C_methylation_states_single_end{ | |
3990 | |
3991 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
1942 my @methylation_calls = split(//,$meth_call); | 3992 my @methylation_calls = split(//,$meth_call); |
1943 | 3993 |
1944 ################################################################# | 3994 ################################################################# |
1945 ### . for bases not involving cytosines ### | 3995 ### . for bases not involving cytosines ### |
1946 ### X for methylated C in CHG context (was protected) ### | 3996 ### X for methylated C in CHG context (was protected) ### |
1956 my $methyl_CpG_count = 0; | 4006 my $methyl_CpG_count = 0; |
1957 my $unmethylated_CHG_count = 0; | 4007 my $unmethylated_CHG_count = 0; |
1958 my $unmethylated_CHH_count = 0; | 4008 my $unmethylated_CHH_count = 0; |
1959 my $unmethylated_CpG_count = 0; | 4009 my $unmethylated_CpG_count = 0; |
1960 | 4010 |
1961 | |
1962 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | 4011 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions |
1963 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | 4012 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels |
4013 | |
1964 my @comp_cigar; | 4014 my @comp_cigar; |
1965 | 4015 |
1966 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | 4016 if ($cigar){ # parsing CIGAR string |
1967 if ($cigar =~ /^\d+M$/){ | 4017 |
1968 } | 4018 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing |
1969 else{ # parsing CIGAR string | 4019 if ($cigar =~ /^\d+M$/){ |
1970 my @len; | 4020 # warn "See!? I told you so! $cigar\n"; |
1971 my @ops; | 4021 # sleep(1); |
1972 @len = split (/\D+/,$cigar); # storing the length per operation | 4022 } |
1973 @ops = split (/\d+/,$cigar); # storing the operation | 4023 else{ |
1974 shift @ops; # remove the empty first element | 4024 |
1975 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | 4025 my @len; |
1976 | 4026 my @ops; |
1977 foreach my $index (0..$#len){ | 4027 |
1978 foreach (1..$len[$index]){ | 4028 @len = split (/\D+/,$cigar); # storing the length per operation |
1979 # print "$ops[$index]"; | 4029 @ops = split (/\d+/,$cigar); # storing the operation |
1980 push @comp_cigar, $ops[$index]; | 4030 shift @ops; # remove the empty first element |
4031 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); | |
4032 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
4033 | |
4034 foreach my $index (0..$#len){ | |
4035 foreach (1..$len[$index]){ | |
4036 # print "$ops[$index]"; | |
4037 push @comp_cigar, $ops[$index]; | |
4038 } | |
1981 } | 4039 } |
1982 } | 4040 } |
1983 # warn "\nDetected CIGAR string: $cigar\n"; | 4041 # warn "\nDetected CIGAR string: $cigar\n"; |
1984 # warn "Length of methylation call: ",length $meth_call,"\n"; | 4042 # warn "Length of methylation call: ",length $meth_call,"\n"; |
1985 # warn "number of operations: ",scalar @ops,"\n"; | 4043 # warn "number of operations: ",scalar @ops,"\n"; |
1987 # print @comp_cigar,"\n"; | 4045 # print @comp_cigar,"\n"; |
1988 # print "$meth_call\n\n"; | 4046 # print "$meth_call\n\n"; |
1989 # sleep (1); | 4047 # sleep (1); |
1990 } | 4048 } |
1991 | 4049 |
1992 if ($strand eq '-') { | |
1993 | |
1994 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
1995 if (@comp_cigar){ | |
1996 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
1997 } | |
1998 # print "reverse CIGAR string: @comp_cigar\n"; | |
1999 | |
2000 ### the start position of paired-end files has already been corrected, see above | |
2001 } | |
2002 | |
2003 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
2004 | |
2005 if ($merge_non_CpG) { | |
2006 | |
2007 if ($no_overlap) { | |
2008 | |
2009 ### single-file CpG and non-CpG context output | |
2010 if ($full) { | |
2011 if ($strand eq '+') { | |
2012 for my $index (0..$#methylation_calls) { | |
2013 | |
2014 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2015 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2016 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2017 $cigar_offset += $cigar_mod; | |
2018 $pos_offset += $pos_mod; | |
2019 } | |
2020 | |
2021 ### Returning as soon as the methylation calls start overlapping | |
2022 if ($start+$index+$pos_offset >= $end_read_1) { | |
2023 return; | |
2024 } | |
2025 | |
2026 if ($methylation_calls[$index] eq 'X') { | |
2027 $counting{total_meCHG_count}++; | |
2028 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2029 } elsif ($methylation_calls[$index] eq 'x') { | |
2030 $counting{total_unmethylated_CHG_count}++; | |
2031 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2032 } elsif ($methylation_calls[$index] eq 'Z') { | |
2033 $counting{total_meCpG_count}++; | |
2034 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2035 } elsif ($methylation_calls[$index] eq 'z') { | |
2036 $counting{total_unmethylated_CpG_count}++; | |
2037 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2038 } elsif ($methylation_calls[$index] eq 'H') { | |
2039 $counting{total_meCHH_count}++; | |
2040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2041 } elsif ($methylation_calls[$index] eq 'h') { | |
2042 $counting{total_unmethylated_CHH_count}++; | |
2043 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2044 } | |
2045 elsif ($methylation_calls[$index] eq '.'){} | |
2046 else{ | |
2047 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2048 } | |
2049 } | |
2050 } elsif ($strand eq '-') { | |
2051 for my $index (0..$#methylation_calls) { | |
2052 | |
2053 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2054 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2055 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2056 $cigar_offset += $cigar_mod; | |
2057 $pos_offset += $pos_mod; | |
2058 } | |
2059 | |
2060 ### Returning as soon as the methylation calls start overlapping | |
2061 if ($start-$index+$pos_offset <= $end_read_1) { | |
2062 return; | |
2063 } | |
2064 | |
2065 if ($methylation_calls[$index] eq 'X') { | |
2066 $counting{total_meCHG_count}++; | |
2067 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2068 } elsif ($methylation_calls[$index] eq 'x') { | |
2069 $counting{total_unmethylated_CHG_count}++; | |
2070 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2071 } elsif ($methylation_calls[$index] eq 'Z') { | |
2072 $counting{total_meCpG_count}++; | |
2073 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2074 } elsif ($methylation_calls[$index] eq 'z') { | |
2075 $counting{total_unmethylated_CpG_count}++; | |
2076 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2077 } elsif ($methylation_calls[$index] eq 'H') { | |
2078 $counting{total_meCHH_count}++; | |
2079 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2080 } elsif ($methylation_calls[$index] eq 'h') { | |
2081 $counting{total_unmethylated_CHH_count}++; | |
2082 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2083 } | |
2084 elsif ($methylation_calls[$index] eq '.') {} | |
2085 else{ | |
2086 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2087 } | |
2088 } | |
2089 } else { | |
2090 die "The read orientation was neither + nor -: '$strand'\n"; | |
2091 } | |
2092 } | |
2093 | |
2094 ### strand-specific methylation output | |
2095 else { | |
2096 if ($strand eq '+') { | |
2097 for my $index (0..$#methylation_calls) { | |
2098 | |
2099 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2100 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2101 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2102 $cigar_offset += $cigar_mod; | |
2103 $pos_offset += $pos_mod; | |
2104 } | |
2105 | |
2106 ### Returning as soon as the methylation calls start overlapping | |
2107 if ($start+$index+$pos_offset >= $end_read_1) { | |
2108 return; | |
2109 } | |
2110 | |
2111 if ($methylation_calls[$index] eq 'X') { | |
2112 $counting{total_meCHG_count}++; | |
2113 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2114 } elsif ($methylation_calls[$index] eq 'x') { | |
2115 $counting{total_unmethylated_CHG_count}++; | |
2116 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2117 } elsif ($methylation_calls[$index] eq 'Z') { | |
2118 $counting{total_meCpG_count}++; | |
2119 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2120 } elsif ($methylation_calls[$index] eq 'z') { | |
2121 $counting{total_unmethylated_CpG_count}++; | |
2122 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2123 } elsif ($methylation_calls[$index] eq 'H') { | |
2124 $counting{total_meCHH_count}++; | |
2125 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2126 } elsif ($methylation_calls[$index] eq 'h') { | |
2127 $counting{total_unmethylated_CHH_count}++; | |
2128 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2129 } | |
2130 elsif ($methylation_calls[$index] eq '.') {} | |
2131 else{ | |
2132 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2133 } | |
2134 } | |
2135 } elsif ($strand eq '-') { | |
2136 for my $index (0..$#methylation_calls) { | |
2137 | |
2138 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2139 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2140 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2141 $cigar_offset += $cigar_mod; | |
2142 $pos_offset += $pos_mod; | |
2143 } | |
2144 | |
2145 ### Returning as soon as the methylation calls start overlapping | |
2146 if ($start-$index+$pos_offset <= $end_read_1) { | |
2147 return; | |
2148 } | |
2149 | |
2150 if ($methylation_calls[$index] eq 'X') { | |
2151 $counting{total_meCHG_count}++; | |
2152 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2153 } elsif ($methylation_calls[$index] eq 'x') { | |
2154 $counting{total_unmethylated_CHG_count}++; | |
2155 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2156 } elsif ($methylation_calls[$index] eq 'Z') { | |
2157 $counting{total_meCpG_count}++; | |
2158 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2159 } elsif ($methylation_calls[$index] eq 'z') { | |
2160 $counting{total_unmethylated_CpG_count}++; | |
2161 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2162 } elsif ($methylation_calls[$index] eq 'H') { | |
2163 $counting{total_meCHH_count}++; | |
2164 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2165 } elsif ($methylation_calls[$index] eq 'h') { | |
2166 $counting{total_unmethylated_CHH_count}++; | |
2167 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2168 } | |
2169 elsif ($methylation_calls[$index] eq '.') {} | |
2170 else{ | |
2171 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2172 } | |
2173 } | |
2174 } else { | |
2175 die "The strand orientation was neither + nor -: '$strand'/n"; | |
2176 } | |
2177 } | |
2178 } | |
2179 | |
2180 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2181 ### Still within the 2-CONTEXT ONLY optional section | |
2182 else { | |
2183 ### single-file CpG and non-CpG context output | |
2184 if ($full) { | |
2185 if ($strand eq '+') { | |
2186 for my $index (0..$#methylation_calls) { | |
2187 | |
2188 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2191 $cigar_offset += $cigar_mod; | |
2192 $pos_offset += $pos_mod; | |
2193 } | |
2194 | |
2195 if ($methylation_calls[$index] eq 'X') { | |
2196 $counting{total_meCHG_count}++; | |
2197 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2198 } elsif ($methylation_calls[$index] eq 'x') { | |
2199 $counting{total_unmethylated_CHG_count}++; | |
2200 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2201 } elsif ($methylation_calls[$index] eq 'Z') { | |
2202 $counting{total_meCpG_count}++; | |
2203 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2204 } elsif ($methylation_calls[$index] eq 'z') { | |
2205 $counting{total_unmethylated_CpG_count}++; | |
2206 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2207 } elsif ($methylation_calls[$index] eq 'H') { | |
2208 $counting{total_meCHH_count}++; | |
2209 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2210 } elsif ($methylation_calls[$index] eq 'h') { | |
2211 $counting{total_unmethylated_CHH_count}++; | |
2212 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2213 } | |
2214 elsif ($methylation_calls[$index] eq '.') {} | |
2215 else{ | |
2216 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2217 } | |
2218 } | |
2219 } elsif ($strand eq '-') { | |
2220 for my $index (0..$#methylation_calls) { | |
2221 | |
2222 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2223 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2224 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2225 $cigar_offset += $cigar_mod; | |
2226 $pos_offset += $pos_mod; | |
2227 } | |
2228 | |
2229 if ($methylation_calls[$index] eq 'X') { | |
2230 $counting{total_meCHG_count}++; | |
2231 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2232 } elsif ($methylation_calls[$index] eq 'x') { | |
2233 $counting{total_unmethylated_CHG_count}++; | |
2234 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2235 } elsif ($methylation_calls[$index] eq 'Z') { | |
2236 $counting{total_meCpG_count}++; | |
2237 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2238 } elsif ($methylation_calls[$index] eq 'z') { | |
2239 $counting{total_unmethylated_CpG_count}++; | |
2240 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2241 } elsif ($methylation_calls[$index] eq 'H') { | |
2242 $counting{total_meCHH_count}++; | |
2243 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2244 } elsif ($methylation_calls[$index] eq 'h') { | |
2245 $counting{total_unmethylated_CHH_count}++; | |
2246 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2247 } | |
2248 elsif ($methylation_calls[$index] eq '.') {} | |
2249 else{ | |
2250 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2251 } | |
2252 } | |
2253 } else { | |
2254 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2255 } | |
2256 } | |
2257 | |
2258 ### strand-specific methylation output | |
2259 ### still within the 2-CONTEXT optional section | |
2260 else { | |
2261 if ($strand eq '+') { | |
2262 for my $index (0..$#methylation_calls) { | |
2263 | |
2264 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2265 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2266 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2267 $cigar_offset += $cigar_mod; | |
2268 $pos_offset += $pos_mod; | |
2269 } | |
2270 | |
2271 if ($methylation_calls[$index] eq 'X') { | |
2272 $counting{total_meCHG_count}++; | |
2273 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2274 } elsif ($methylation_calls[$index] eq 'x') { | |
2275 $counting{total_unmethylated_CHG_count}++; | |
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2277 } elsif ($methylation_calls[$index] eq 'Z') { | |
2278 $counting{total_meCpG_count}++; | |
2279 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2280 } elsif ($methylation_calls[$index] eq 'z') { | |
2281 $counting{total_unmethylated_CpG_count}++; | |
2282 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2283 } elsif ($methylation_calls[$index] eq 'H') { | |
2284 $counting{total_meCHH_count}++; | |
2285 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2286 } elsif ($methylation_calls[$index] eq 'h') { | |
2287 $counting{total_unmethylated_CHH_count}++; | |
2288 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2289 } | |
2290 elsif ($methylation_calls[$index] eq '.') {} | |
2291 else{ | |
2292 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2293 } | |
2294 } | |
2295 } elsif ($strand eq '-') { | |
2296 for my $index (0..$#methylation_calls) { | |
2297 | |
2298 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2299 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2300 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2301 $cigar_offset += $cigar_mod; | |
2302 $pos_offset += $pos_mod; | |
2303 } | |
2304 | |
2305 if ($methylation_calls[$index] eq 'X') { | |
2306 $counting{total_meCHG_count}++; | |
2307 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2308 } elsif ($methylation_calls[$index] eq 'x') { | |
2309 $counting{total_unmethylated_CHG_count}++; | |
2310 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2311 } elsif ($methylation_calls[$index] eq 'Z') { | |
2312 $counting{total_meCpG_count}++; | |
2313 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2314 } elsif ($methylation_calls[$index] eq 'z') { | |
2315 $counting{total_unmethylated_CpG_count}++; | |
2316 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2317 } elsif ($methylation_calls[$index] eq 'H') { | |
2318 $counting{total_meCHH_count}++; | |
2319 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2320 } elsif ($methylation_calls[$index] eq 'h') { | |
2321 $counting{total_unmethylated_CHH_count}++; | |
2322 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2323 } | |
2324 elsif ($methylation_calls[$index] eq '.') {} | |
2325 else{ | |
2326 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2327 } | |
2328 } | |
2329 } else { | |
2330 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2331 } | |
2332 } | |
2333 } | |
2334 } | |
2335 | |
2336 ############################################ | |
2337 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
2338 ############################################ | |
2339 | |
2340 elsif ($no_overlap) { | |
2341 ### single-file CpG, CHG and CHH context output | |
2342 if ($full) { | |
2343 if ($strand eq '+') { | |
2344 for my $index (0..$#methylation_calls) { | |
2345 | |
2346 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2347 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2348 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2349 $cigar_offset += $cigar_mod; | |
2350 $pos_offset += $pos_mod; | |
2351 } | |
2352 | |
2353 ### Returning as soon as the methylation calls start overlapping | |
2354 if ($start+$index+$pos_offset >= $end_read_1) { | |
2355 return; | |
2356 } | |
2357 | |
2358 if ($methylation_calls[$index] eq 'X') { | |
2359 $counting{total_meCHG_count}++; | |
2360 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2361 } elsif ($methylation_calls[$index] eq 'x') { | |
2362 $counting{total_unmethylated_CHG_count}++; | |
2363 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2364 } elsif ($methylation_calls[$index] eq 'Z') { | |
2365 $counting{total_meCpG_count}++; | |
2366 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2367 } elsif ($methylation_calls[$index] eq 'z') { | |
2368 $counting{total_unmethylated_CpG_count}++; | |
2369 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2370 } elsif ($methylation_calls[$index] eq 'H') { | |
2371 $counting{total_meCHH_count}++; | |
2372 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2373 } elsif ($methylation_calls[$index] eq 'h') { | |
2374 $counting{total_unmethylated_CHH_count}++; | |
2375 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2376 } | |
2377 elsif ($methylation_calls[$index] eq '.') {} | |
2378 else{ | |
2379 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2380 } | |
2381 } | |
2382 } elsif ($strand eq '-') { | |
2383 for my $index (0..$#methylation_calls) { | |
2384 | |
2385 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2386 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2387 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2388 $cigar_offset += $cigar_mod; | |
2389 $pos_offset += $pos_mod; | |
2390 } | |
2391 | |
2392 ### Returning as soon as the methylation calls start overlapping | |
2393 if ($start-$index+$pos_offset <= $end_read_1) { | |
2394 return; | |
2395 } | |
2396 | |
2397 if ($methylation_calls[$index] eq 'X') { | |
2398 $counting{total_meCHG_count}++; | |
2399 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2400 } elsif ($methylation_calls[$index] eq 'x') { | |
2401 $counting{total_unmethylated_CHG_count}++; | |
2402 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2403 } elsif ($methylation_calls[$index] eq 'Z') { | |
2404 $counting{total_meCpG_count}++; | |
2405 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2406 } elsif ($methylation_calls[$index] eq 'z') { | |
2407 $counting{total_unmethylated_CpG_count}++; | |
2408 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2409 } elsif ($methylation_calls[$index] eq 'H') { | |
2410 $counting{total_meCHH_count}++; | |
2411 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2412 } elsif ($methylation_calls[$index] eq 'h') { | |
2413 $counting{total_unmethylated_CHH_count}++; | |
2414 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2415 } | |
2416 elsif ($methylation_calls[$index] eq '.') {} | |
2417 else{ | |
2418 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2419 } | |
2420 } | |
2421 } else { | |
2422 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2423 } | |
2424 } | |
2425 | |
2426 ### strand-specific methylation output | |
2427 else { | |
2428 if ($strand eq '+') { | |
2429 for my $index (0..$#methylation_calls) { | |
2430 | |
2431 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2432 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2433 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2434 $cigar_offset += $cigar_mod; | |
2435 $pos_offset += $pos_mod; | |
2436 } | |
2437 | |
2438 ### Returning as soon as the methylation calls start overlapping | |
2439 if ($start+$index+$pos_offset >= $end_read_1) { | |
2440 return; | |
2441 } | |
2442 | |
2443 if ($methylation_calls[$index] eq 'X') { | |
2444 $counting{total_meCHG_count}++; | |
2445 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2446 } elsif ($methylation_calls[$index] eq 'x') { | |
2447 $counting{total_unmethylated_CHG_count}++; | |
2448 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2449 } elsif ($methylation_calls[$index] eq 'Z') { | |
2450 $counting{total_meCpG_count}++; | |
2451 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2452 } elsif ($methylation_calls[$index] eq 'z') { | |
2453 $counting{total_unmethylated_CpG_count}++; | |
2454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2455 } elsif ($methylation_calls[$index] eq 'H') { | |
2456 $counting{total_meCHH_count}++; | |
2457 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2458 } elsif ($methylation_calls[$index] eq 'h') { | |
2459 $counting{total_unmethylated_CHH_count}++; | |
2460 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2461 } | |
2462 elsif ($methylation_calls[$index] eq '.') {} | |
2463 else{ | |
2464 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2465 } | |
2466 } | |
2467 } elsif ($strand eq '-') { | |
2468 for my $index (0..$#methylation_calls) { | |
2469 | |
2470 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2471 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2472 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2473 $cigar_offset += $cigar_mod; | |
2474 $pos_offset += $pos_mod; | |
2475 } | |
2476 | |
2477 ### Returning as soon as the methylation calls start overlapping | |
2478 if ($start-$index+$pos_offset <= $end_read_1) { | |
2479 return; | |
2480 } | |
2481 | |
2482 if ($methylation_calls[$index] eq 'X') { | |
2483 $counting{total_meCHG_count}++; | |
2484 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2485 } elsif ($methylation_calls[$index] eq 'x') { | |
2486 $counting{total_unmethylated_CHG_count}++; | |
2487 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2488 } elsif ($methylation_calls[$index] eq 'Z') { | |
2489 $counting{total_meCpG_count}++; | |
2490 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2491 } elsif ($methylation_calls[$index] eq 'z') { | |
2492 $counting{total_unmethylated_CpG_count}++; | |
2493 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2494 } elsif ($methylation_calls[$index] eq 'H') { | |
2495 $counting{total_meCHH_count}++; | |
2496 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2497 } elsif ($methylation_calls[$index] eq 'h') { | |
2498 $counting{total_unmethylated_CHH_count}++; | |
2499 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2500 } | |
2501 elsif ($methylation_calls[$index] eq '.') {} | |
2502 else{ | |
2503 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2504 } | |
2505 } | |
2506 } else { | |
2507 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2508 } | |
2509 } | |
2510 } | |
2511 | |
2512 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
2513 else { | |
2514 ### single-file CpG, CHG and CHH context output | |
2515 if ($full) { | |
2516 if ($strand eq '+') { | |
2517 for my $index (0..$#methylation_calls) { | |
2518 | |
2519 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2520 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2521 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2522 $cigar_offset += $cigar_mod; | |
2523 $pos_offset += $pos_mod; | |
2524 } | |
2525 | |
2526 if ($methylation_calls[$index] eq 'X') { | |
2527 $counting{total_meCHG_count}++; | |
2528 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2529 } elsif ($methylation_calls[$index] eq 'x') { | |
2530 $counting{total_unmethylated_CHG_count}++; | |
2531 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2532 } elsif ($methylation_calls[$index] eq 'Z') { | |
2533 $counting{total_meCpG_count}++; | |
2534 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2535 } elsif ($methylation_calls[$index] eq 'z') { | |
2536 $counting{total_unmethylated_CpG_count}++; | |
2537 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2538 } elsif ($methylation_calls[$index] eq 'H') { | |
2539 $counting{total_meCHH_count}++; | |
2540 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2541 } elsif ($methylation_calls[$index] eq 'h') { | |
2542 $counting{total_unmethylated_CHH_count}++; | |
2543 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2544 } | |
2545 elsif ($methylation_calls[$index] eq '.') {} | |
2546 else{ | |
2547 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2548 } | |
2549 } | |
2550 } elsif ($strand eq '-') { | |
2551 for my $index (0..$#methylation_calls) { | |
2552 | |
2553 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2554 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2555 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2556 $cigar_offset += $cigar_mod; | |
2557 $pos_offset += $pos_mod; | |
2558 } | |
2559 | |
2560 if ($methylation_calls[$index] eq 'X') { | |
2561 $counting{total_meCHG_count}++; | |
2562 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2563 } elsif ($methylation_calls[$index] eq 'x') { | |
2564 $counting{total_unmethylated_CHG_count}++; | |
2565 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2566 } elsif ($methylation_calls[$index] eq 'Z') { | |
2567 $counting{total_meCpG_count}++; | |
2568 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2569 } elsif ($methylation_calls[$index] eq 'z') { | |
2570 $counting{total_unmethylated_CpG_count}++; | |
2571 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2572 } elsif ($methylation_calls[$index] eq 'H') { | |
2573 $counting{total_meCHH_count}++; | |
2574 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2575 } elsif ($methylation_calls[$index] eq 'h') { | |
2576 $counting{total_unmethylated_CHH_count}++; | |
2577 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2578 } | |
2579 elsif ($methylation_calls[$index] eq '.') {} | |
2580 else{ | |
2581 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2582 } | |
2583 } | |
2584 } else { | |
2585 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2586 } | |
2587 } | |
2588 | |
2589 ### strand-specific methylation output | |
2590 else { | |
2591 if ($strand eq '+') { | |
2592 for my $index (0..$#methylation_calls) { | |
2593 | |
2594 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2595 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2596 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2597 $cigar_offset += $cigar_mod; | |
2598 $pos_offset += $pos_mod; | |
2599 } | |
2600 | |
2601 if ($methylation_calls[$index] eq 'X') { | |
2602 $counting{total_meCHG_count}++; | |
2603 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2604 } elsif ($methylation_calls[$index] eq 'x') { | |
2605 $counting{total_unmethylated_CHG_count}++; | |
2606 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2607 } elsif ($methylation_calls[$index] eq 'Z') { | |
2608 $counting{total_meCpG_count}++; | |
2609 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2610 } elsif ($methylation_calls[$index] eq 'z') { | |
2611 $counting{total_unmethylated_CpG_count}++; | |
2612 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2613 } elsif ($methylation_calls[$index] eq 'H') { | |
2614 $counting{total_meCHH_count}++; | |
2615 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2616 } elsif ($methylation_calls[$index] eq 'h') { | |
2617 $counting{total_unmethylated_CHH_count}++; | |
2618 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2619 } | |
2620 elsif ($methylation_calls[$index] eq '.') {} | |
2621 else{ | |
2622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2623 } | |
2624 } | |
2625 } elsif ($strand eq '-') { | |
2626 for my $index (0..$#methylation_calls) { | |
2627 | |
2628 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2629 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
2630 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2631 $cigar_offset += $cigar_mod; | |
2632 $pos_offset += $pos_mod; | |
2633 } | |
2634 | |
2635 if ($methylation_calls[$index] eq 'X') { | |
2636 $counting{total_meCHG_count}++; | |
2637 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2638 } elsif ($methylation_calls[$index] eq 'x') { | |
2639 $counting{total_unmethylated_CHG_count}++; | |
2640 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2641 } elsif ($methylation_calls[$index] eq 'Z') { | |
2642 $counting{total_meCpG_count}++; | |
2643 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2644 } elsif ($methylation_calls[$index] eq 'z') { | |
2645 $counting{total_unmethylated_CpG_count}++; | |
2646 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2647 } elsif ($methylation_calls[$index] eq 'H') { | |
2648 $counting{total_meCHH_count}++; | |
2649 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2650 } elsif ($methylation_calls[$index] eq 'h') { | |
2651 $counting{total_unmethylated_CHH_count}++; | |
2652 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | |
2653 } | |
2654 elsif ($methylation_calls[$index] eq '.') {} | |
2655 else{ | |
2656 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
2657 } | |
2658 } | |
2659 } else { | |
2660 die "The strand orientation as neither + nor -: '$strand'\n"; | |
2661 } | |
2662 } | |
2663 } | |
2664 } | |
2665 | |
### Works out by how much the composite-CIGAR cursor and the genomic position
### have to be shifted for the methylation call at read position $index.
###
### Arguments:
###   $index        - 0-based position within the methylation call string
###   $cigar_offset - cursor shift accumulated so far by the caller (deleted
###                   bases occupy slots in the composite CIGAR but have no
###                   counterpart in the methylation call string)
###   $pos_offset   - position shift accumulated so far; accepted so existing
###                   call sites keep working, but not needed for the calculation
###   $strand       - '+' or '-'
###   $comp_cigar   - reference to the composite CIGAR array holding one
###                   operation ('M', 'I' or 'D') per element
###
### Returns the list ($new_cigar_offset,$new_pos_offset). Both values are deltas
### which the caller adds to its own running offsets. For any strand other than
### '+' or '-' the deltas are (0,0).
###
### Dies if the operation under the cursor is anything other than 'M', 'I' or 'D'
### (this includes a cursor that has run past the end of the composite CIGAR).
sub check_cigar_string {
  my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
  # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
  my ($new_cigar_offset,$new_pos_offset) = (0,0);

  ### The two strands only differ in the direction of the position adjustment:
  ### on the + strand a deleted base moves the genomic position up by 1 bp and an
  ### inserted base moves it down by 1 bp; on the - strand it is the other way round
  my $deletion_shift;
  if ($strand eq '+') {
    $deletion_shift = 1;
  }
  elsif ($strand eq '-') {
    $deletion_shift = -1;
  }
  else {
    return ($new_cigar_offset,$new_pos_offset);
  }
  my $insertion_shift = -$deletion_shift;

  ### Scalar element lookup (not a one-element slice). A position outside of the
  ### composite CIGAR yields an empty string so that it ends up in the regular
  ### 'undefined operations' error without comparing undef first
  my $operation_at = sub {
    my $operation = $comp_cigar->[$_[0]];
    return defined $operation ? $operation : '';
  };

  my $cursor = $index + $cigar_offset;
  my $operation = $operation_at->($cursor);

  if ($operation eq 'M'){ # sequence position matches the genomic position
    # warn "position needs no adjustment\n";
  }

  elsif ($operation eq 'I'){ # insertion in the read sequence
    $new_pos_offset += $insertion_shift; # inserted bases do not exist in the genome
    # warn "adjusted genomic position by 1 bp (insertion)\n";
  }

  elsif ($operation eq 'D'){ # deletion in the read sequence
    $new_cigar_offset += 1;              # the composite cigar string does no longer match the methylation call index
    $new_pos_offset += $deletion_shift;  # deleted bases still take up space in the genome
    # warn "adjusted genomic position by 1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";

    ### skipping over the rest of the deletion until the operation which belongs to the current read base
    while ( ($cursor + $new_cigar_offset) < (scalar @$comp_cigar) ){
      $operation = $operation_at->($cursor + $new_cigar_offset);

      if ($operation eq 'M'){ # sequence position matches the genomic position
	# warn "Found new 'M' operation; position needs no adjustment\n";
	last;
      }
      elsif ($operation eq 'I'){ # the deletion is directly followed by an insertion
	$new_pos_offset += $insertion_shift;
	# warn "Found new 'I' operation; adjusted genomic position by another 1 bp (insertion)\n";
	last;
      }
      elsif ($operation eq 'D'){ # deletion in the read sequence
	$new_cigar_offset += 1;
	$new_pos_offset += $deletion_shift;
	# warn "adjusted genomic position by another 1 bp (deletion)\n";
      }
      else{
	die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
      }
    }
  }
  else{
    die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
  }

  # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
  return ($new_cigar_offset,$new_pos_offset);
}
2757 | |
2758 sub print_individual_C_methylation_states_single_end{ | |
2759 | |
2760 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
2761 my @methylation_calls = split(//,$meth_call); | |
2762 | |
2763 ################################################################# | |
2764 ### . for bases not involving cytosines ### | |
2765 ### X for methylated C in CHG context (was protected) ### | |
2766 ### x for not methylated C in CHG context (was converted) ### | |
2767 ### H for methylated C in CHH context (was protected) ### | |
2768 ### h for not methylated C in CHH context (was converted) ### | |
2769 ### Z for methylated C in CpG context (was protected) ### | |
2770 ### z for not methylated C in CpG context (was converted) ### | |
2771 ################################################################# | |
2772 | |
2773 my $methyl_CHG_count = 0; | |
2774 my $methyl_CHH_count = 0; | |
2775 my $methyl_CpG_count = 0; | |
2776 my $unmethylated_CHG_count = 0; | |
2777 my $unmethylated_CHH_count = 0; | |
2778 my $unmethylated_CpG_count = 0; | |
2779 | |
2780 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2781 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2782 | |
2783 my @comp_cigar; | |
2784 | |
2785 if ($cigar){ # parsing CIGAR string | |
2786 | |
2787 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
2788 if ($cigar =~ /^\d+M$/){ | |
2789 # warn "See!? I told you so! $cigar\n"; | |
2790 # sleep(1); | |
2791 } | |
2792 else{ | |
2793 | |
2794 my @len; | |
2795 my @ops; | |
2796 | |
2797 @len = split (/\D+/,$cigar); # storing the length per operation | |
2798 @ops = split (/\d+/,$cigar); # storing the operation | |
2799 shift @ops; # remove the empty first element | |
2800 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2801 | |
2802 foreach my $index (0..$#len){ | |
2803 foreach (1..$len[$index]){ | |
2804 # print "$ops[$index]"; | |
2805 push @comp_cigar, $ops[$index]; | |
2806 } | |
2807 } | |
2808 } | |
2809 # warn "\nDetected CIGAR string: $cigar\n"; | |
2810 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2811 # warn "number of operations: ",scalar @ops,"\n"; | |
2812 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2813 # print @comp_cigar,"\n"; | |
2814 # print "$meth_call\n\n"; | |
2815 # sleep (1); | |
2816 } | |
2817 | |
2818 ### adjusting the start position for all reads mapping to the reverse strand | 4050 ### adjusting the start position for all reads mapping to the reverse strand |
2819 if ($strand eq '-') { | 4051 if ($strand eq '-') { |
2820 | 4052 |
2821 if (@comp_cigar){ # only needed for SAM reads with InDels | 4053 if (@comp_cigar){ # only needed for SAM reads with InDels |
2822 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | 4054 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too |
2823 # print @comp_cigar,"\n"; | 4055 # print @comp_cigar,"\n"; |
2824 } | 4056 } |
2825 | 4057 |
2826 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | 4058 unless ($ignore){ ### if --ignore was specified the start position has already been corrected |
2827 | 4059 |
2828 if ($cigar){ ### SAM format | 4060 if ($cigar){ ### SAM format |
2829 if ($cigar =~ /^(\d+)M$/){ # linear match | 4061 if ($cigar =~ /^(\d+)M$/){ # linear match |
2830 $start += $1 - 1; | 4062 $start += $1 - 1; |
2831 } | 4063 } |
2832 else{ # InDel read | 4064 else{ # InDel read |
2859 | 4091 |
2860 ### methylated Cs (any context) will receive a forward (+) orientation | 4092 ### methylated Cs (any context) will receive a forward (+) orientation |
2861 ### not methylated Cs (any context) will receive a reverse (-) orientation | 4093 ### not methylated Cs (any context) will receive a reverse (-) orientation |
2862 if ($methylation_calls[$index] eq 'X') { | 4094 if ($methylation_calls[$index] eq 'X') { |
2863 $counting{total_meCHG_count}++; | 4095 $counting{total_meCHG_count}++; |
2864 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4096 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4097 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2865 } | 4098 } |
2866 elsif ($methylation_calls[$index] eq 'x') { | 4099 elsif ($methylation_calls[$index] eq 'x') { |
2867 $counting{total_unmethylated_CHG_count}++; | 4100 $counting{total_unmethylated_CHG_count}++; |
2868 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4101 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4102 $mbias_1{CHG}->{$index+1}->{un}++; | |
2869 } | 4103 } |
2870 elsif ($methylation_calls[$index] eq 'Z') { | 4104 elsif ($methylation_calls[$index] eq 'Z') { |
2871 $counting{total_meCpG_count}++; | 4105 $counting{total_meCpG_count}++; |
2872 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4106 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4107 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2873 } | 4108 } |
2874 elsif ($methylation_calls[$index] eq 'z') { | 4109 elsif ($methylation_calls[$index] eq 'z') { |
2875 $counting{total_unmethylated_CpG_count}++; | 4110 $counting{total_unmethylated_CpG_count}++; |
2876 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4111 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4112 $mbias_1{CpG}->{$index+1}->{un}++; | |
2877 } | 4113 } |
2878 elsif ($methylation_calls[$index] eq 'H') { | 4114 elsif ($methylation_calls[$index] eq 'H') { |
2879 $counting{total_meCHH_count}++; | 4115 $counting{total_meCHH_count}++; |
2880 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4116 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4117 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2881 } | 4118 } |
2882 elsif ($methylation_calls[$index] eq 'h') { | 4119 elsif ($methylation_calls[$index] eq 'h') { |
2883 $counting{total_unmethylated_CHH_count}++; | 4120 $counting{total_unmethylated_CHH_count}++; |
2884 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4121 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
2885 } | 4122 $mbias_1{CHH}->{$index+1}->{un}++; |
2886 elsif ($methylation_calls[$index] eq '.') { | 4123 } |
2887 } | 4124 elsif ($methylation_calls[$index] eq '.') {} |
4125 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2888 else{ | 4126 else{ |
2889 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4127 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
2890 } | 4128 } |
2891 } | 4129 } |
2892 } | 4130 } |
2903 $pos_offset += $pos_mod; | 4141 $pos_offset += $pos_mod; |
2904 } | 4142 } |
2905 | 4143 |
2906 if ($methylation_calls[$index] eq 'X') { | 4144 if ($methylation_calls[$index] eq 'X') { |
2907 $counting{total_meCHG_count}++; | 4145 $counting{total_meCHG_count}++; |
2908 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4146 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4147 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2909 } | 4148 } |
2910 elsif ($methylation_calls[$index] eq 'x') { | 4149 elsif ($methylation_calls[$index] eq 'x') { |
2911 $counting{total_unmethylated_CHG_count}++; | 4150 $counting{total_unmethylated_CHG_count}++; |
2912 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4151 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4152 $mbias_1{CHG}->{$index+1}->{un}++; | |
2913 } | 4153 } |
2914 elsif ($methylation_calls[$index] eq 'Z') { | 4154 elsif ($methylation_calls[$index] eq 'Z') { |
2915 $counting{total_meCpG_count}++; | 4155 $counting{total_meCpG_count}++; |
2916 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4156 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4157 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2917 } | 4158 } |
2918 elsif ($methylation_calls[$index] eq 'z') { | 4159 elsif ($methylation_calls[$index] eq 'z') { |
2919 $counting{total_unmethylated_CpG_count}++; | 4160 $counting{total_unmethylated_CpG_count}++; |
2920 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4161 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4162 $mbias_1{CpG}->{$index+1}->{un}++; | |
2921 } | 4163 } |
2922 elsif ($methylation_calls[$index] eq 'H') { | 4164 elsif ($methylation_calls[$index] eq 'H') { |
2923 $counting{total_meCHH_count}++; | 4165 $counting{total_meCHH_count}++; |
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4166 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4167 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2925 } | 4168 } |
2926 elsif ($methylation_calls[$index] eq 'h') { | 4169 elsif ($methylation_calls[$index] eq 'h') { |
2927 $counting{total_unmethylated_CHH_count}++; | 4170 $counting{total_unmethylated_CHH_count}++; |
2928 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4171 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
2929 } | 4172 $mbias_1{CHH}->{$index+1}->{un}++; |
2930 elsif ($methylation_calls[$index] eq '.'){ | 4173 } |
2931 } | 4174 elsif ($methylation_calls[$index] eq '.'){} |
4175 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2932 else{ | 4176 else{ |
2933 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
2934 } | 4178 } |
2935 } | 4179 } |
2936 } | 4180 } |
2952 $pos_offset += $pos_mod; | 4196 $pos_offset += $pos_mod; |
2953 } | 4197 } |
2954 | 4198 |
2955 if ($methylation_calls[$index] eq 'X') { | 4199 if ($methylation_calls[$index] eq 'X') { |
2956 $counting{total_meCHG_count}++; | 4200 $counting{total_meCHG_count}++; |
2957 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4201 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4202 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2958 } | 4203 } |
2959 elsif ($methylation_calls[$index] eq 'x') { | 4204 elsif ($methylation_calls[$index] eq 'x') { |
2960 $counting{total_unmethylated_CHG_count}++; | 4205 $counting{total_unmethylated_CHG_count}++; |
2961 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4206 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4207 $mbias_1{CHG}->{$index+1}->{un}++; | |
2962 } | 4208 } |
2963 elsif ($methylation_calls[$index] eq 'Z') { | 4209 elsif ($methylation_calls[$index] eq 'Z') { |
2964 $counting{total_meCpG_count}++; | 4210 $counting{total_meCpG_count}++; |
2965 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4211 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4212 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2966 } | 4213 } |
2967 elsif ($methylation_calls[$index] eq 'z') { | 4214 elsif ($methylation_calls[$index] eq 'z') { |
2968 $counting{total_unmethylated_CpG_count}++; | 4215 $counting{total_unmethylated_CpG_count}++; |
2969 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4217 $mbias_1{CpG}->{$index+1}->{un}++; | |
2970 } | 4218 } |
2971 elsif ($methylation_calls[$index] eq 'H') { | 4219 elsif ($methylation_calls[$index] eq 'H') { |
2972 $counting{total_meCHH_count}++; | 4220 $counting{total_meCHH_count}++; |
2973 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4221 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4222 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2974 } | 4223 } |
2975 elsif ($methylation_calls[$index] eq 'h') { | 4224 elsif ($methylation_calls[$index] eq 'h') { |
2976 $counting{total_unmethylated_CHH_count}++; | 4225 $counting{total_unmethylated_CHH_count}++; |
2977 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4226 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
2978 } | 4227 $mbias_1{CHH}->{$index+1}->{un}++; |
2979 elsif ($methylation_calls[$index] eq '.') { | 4228 } |
2980 } | 4229 elsif ($methylation_calls[$index] eq '.') {} |
4230 elsif (lc$methylation_calls[$index] eq 'u'){} | |
2981 else{ | 4231 else{ |
2982 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4232 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
2983 } | 4233 } |
2984 } | 4234 } |
2985 } | 4235 } |
2995 $pos_offset += $pos_mod; | 4245 $pos_offset += $pos_mod; |
2996 } | 4246 } |
2997 | 4247 |
2998 if ($methylation_calls[$index] eq 'X') { | 4248 if ($methylation_calls[$index] eq 'X') { |
2999 $counting{total_meCHG_count}++; | 4249 $counting{total_meCHG_count}++; |
3000 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4250 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4251 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3001 } | 4252 } |
3002 elsif ($methylation_calls[$index] eq 'x') { | 4253 elsif ($methylation_calls[$index] eq 'x') { |
3003 $counting{total_unmethylated_CHG_count}++; | 4254 $counting{total_unmethylated_CHG_count}++; |
3004 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4255 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4256 $mbias_1{CHG}->{$index+1}->{un}++; | |
3005 } | 4257 } |
3006 elsif ($methylation_calls[$index] eq 'Z') { | 4258 elsif ($methylation_calls[$index] eq 'Z') { |
3007 $counting{total_meCpG_count}++; | 4259 $counting{total_meCpG_count}++; |
3008 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4260 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4261 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3009 } | 4262 } |
3010 elsif ($methylation_calls[$index] eq 'z') { | 4263 elsif ($methylation_calls[$index] eq 'z') { |
3011 $counting{total_unmethylated_CpG_count}++; | 4264 $counting{total_unmethylated_CpG_count}++; |
3012 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4265 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4266 $mbias_1{CpG}->{$index+1}->{un}++; | |
3013 } | 4267 } |
3014 elsif ($methylation_calls[$index] eq 'H') { | 4268 elsif ($methylation_calls[$index] eq 'H') { |
3015 $counting{total_meCHH_count}++; | 4269 $counting{total_meCHH_count}++; |
3016 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4270 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4271 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3017 } | 4272 } |
3018 elsif ($methylation_calls[$index] eq 'h') { | 4273 elsif ($methylation_calls[$index] eq 'h') { |
3019 $counting{total_unmethylated_CHH_count}++; | 4274 $counting{total_unmethylated_CHH_count}++; |
3020 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4275 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3021 } | 4276 $mbias_1{CHH}->{$index+1}->{un}++; |
3022 elsif ($methylation_calls[$index] eq '.') { | 4277 } |
3023 } | 4278 elsif ($methylation_calls[$index] eq '.') {} |
4279 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3024 else{ | 4280 else{ |
3025 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
3026 } | 4282 } |
3027 } | 4283 } |
3028 } | 4284 } |
3042 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | 4298 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels |
3043 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | 4299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); |
3044 $cigar_offset += $cigar_mod; | 4300 $cigar_offset += $cigar_mod; |
3045 $pos_offset += $pos_mod; | 4301 $pos_offset += $pos_mod; |
3046 } | 4302 } |
3047 | 4303 |
3048 if ($methylation_calls[$index] eq 'X') { | 4304 if ($methylation_calls[$index] eq 'X') { |
3049 $counting{total_meCHG_count}++; | 4305 $counting{total_meCHG_count}++; |
3050 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3051 } elsif ($methylation_calls[$index] eq 'x') { | 4307 $mbias_1{CHG}->{$index+1}->{meth}++; |
4308 } | |
4309 elsif ($methylation_calls[$index] eq 'x') { | |
3052 $counting{total_unmethylated_CHG_count}++; | 4310 $counting{total_unmethylated_CHG_count}++; |
3053 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4311 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3054 } elsif ($methylation_calls[$index] eq 'Z') { | 4312 $mbias_1{CHG}->{$index+1}->{un}++; |
4313 } | |
4314 elsif ($methylation_calls[$index] eq 'Z') { | |
3055 $counting{total_meCpG_count}++; | 4315 $counting{total_meCpG_count}++; |
3056 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4316 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3057 } elsif ($methylation_calls[$index] eq 'z') { | 4317 $mbias_1{CpG}->{$index+1}->{meth}++; |
4318 } | |
4319 elsif ($methylation_calls[$index] eq 'z') { | |
3058 $counting{total_unmethylated_CpG_count}++; | 4320 $counting{total_unmethylated_CpG_count}++; |
3059 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4321 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3060 } elsif ($methylation_calls[$index] eq 'H') { | 4322 $mbias_1{CpG}->{$index+1}->{un}++; |
4323 } | |
4324 elsif ($methylation_calls[$index] eq 'H') { | |
3061 $counting{total_meCHH_count}++; | 4325 $counting{total_meCHH_count}++; |
3062 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4326 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3063 } elsif ($methylation_calls[$index] eq 'h') { | 4327 $mbias_1{CHH}->{$index+1}->{meth}++; |
4328 } | |
4329 elsif ($methylation_calls[$index] eq 'h') { | |
3064 $counting{total_unmethylated_CHH_count}++; | 4330 $counting{total_unmethylated_CHH_count}++; |
3065 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4331 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4332 $mbias_1{CHH}->{$index+1}->{un}++; | |
3066 } | 4333 } |
3067 elsif ($methylation_calls[$index] eq '.') {} | 4334 elsif ($methylation_calls[$index] eq '.') {} |
4335 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3068 else{ | 4336 else{ |
3069 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4337 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); |
3070 } | 4338 } |
3071 } | 4339 } |
3072 } | 4340 } |
3073 elsif ($strand eq '-') { | 4341 elsif ($strand eq '-') { |
3074 | 4342 |
3082 $pos_offset += $pos_mod; | 4350 $pos_offset += $pos_mod; |
3083 } | 4351 } |
3084 | 4352 |
3085 if ($methylation_calls[$index] eq 'X') { | 4353 if ($methylation_calls[$index] eq 'X') { |
3086 $counting{total_meCHG_count}++; | 4354 $counting{total_meCHG_count}++; |
3087 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4355 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3088 } elsif ($methylation_calls[$index] eq 'x') { | 4356 $mbias_1{CHG}->{$index+1}->{meth}++; |
4357 } | |
4358 elsif ($methylation_calls[$index] eq 'x') { | |
3089 $counting{total_unmethylated_CHG_count}++; | 4359 $counting{total_unmethylated_CHG_count}++; |
3090 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4360 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3091 } elsif ($methylation_calls[$index] eq 'Z') { | 4361 $mbias_1{CHG}->{$index+1}->{un}++; |
4362 } | |
4363 elsif ($methylation_calls[$index] eq 'Z') { | |
3092 $counting{total_meCpG_count}++; | 4364 $counting{total_meCpG_count}++; |
3093 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4365 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3094 } elsif ($methylation_calls[$index] eq 'z') { | 4366 $mbias_1{CpG}->{$index+1}->{meth}++; |
4367 } | |
4368 elsif ($methylation_calls[$index] eq 'z') { | |
3095 $counting{total_unmethylated_CpG_count}++; | 4369 $counting{total_unmethylated_CpG_count}++; |
3096 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4370 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3097 } elsif ($methylation_calls[$index] eq 'H') { | 4371 $mbias_1{CpG}->{$index+1}->{un}++; |
4372 } | |
4373 elsif ($methylation_calls[$index] eq 'H') { | |
3098 $counting{total_meCHH_count}++; | 4374 $counting{total_meCHH_count}++; |
3099 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4375 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3100 } elsif ($methylation_calls[$index] eq 'h') { | 4376 $mbias_1{CHH}->{$index+1}->{meth}++; |
4377 } | |
4378 elsif ($methylation_calls[$index] eq 'h') { | |
3101 $counting{total_unmethylated_CHH_count}++; | 4379 $counting{total_unmethylated_CHH_count}++; |
3102 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4380 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4381 $mbias_1{CHH}->{$index+1}->{un}++; | |
3103 } | 4382 } |
3104 elsif ($methylation_calls[$index] eq '.') {} | 4383 elsif ($methylation_calls[$index] eq '.') {} |
4384 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3105 else{ | 4385 else{ |
3106 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4386 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
3107 } | 4387 } |
3108 } | 4388 } |
3109 } | 4389 } |
3125 $pos_offset += $pos_mod; | 4405 $pos_offset += $pos_mod; |
3126 } | 4406 } |
3127 | 4407 |
3128 if ($methylation_calls[$index] eq 'X') { | 4408 if ($methylation_calls[$index] eq 'X') { |
3129 $counting{total_meCHG_count}++; | 4409 $counting{total_meCHG_count}++; |
3130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4410 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3131 } elsif ($methylation_calls[$index] eq 'x') { | 4411 $mbias_1{CHG}->{$index+1}->{meth}++; |
4412 } | |
4413 elsif ($methylation_calls[$index] eq 'x') { | |
3132 $counting{total_unmethylated_CHG_count}++; | 4414 $counting{total_unmethylated_CHG_count}++; |
3133 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4415 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3134 } elsif ($methylation_calls[$index] eq 'Z') { | 4416 $mbias_1{CHG}->{$index+1}->{un}++; |
4417 } | |
4418 elsif ($methylation_calls[$index] eq 'Z') { | |
3135 $counting{total_meCpG_count}++; | 4419 $counting{total_meCpG_count}++; |
3136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4420 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3137 } elsif ($methylation_calls[$index] eq 'z') { | 4421 $mbias_1{CpG}->{$index+1}->{meth}++; |
4422 } | |
4423 elsif ($methylation_calls[$index] eq 'z') { | |
3138 $counting{total_unmethylated_CpG_count}++; | 4424 $counting{total_unmethylated_CpG_count}++; |
3139 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4425 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3140 } elsif ($methylation_calls[$index] eq 'H') { | 4426 $mbias_1{CpG}->{$index+1}->{un}++; |
4427 } | |
4428 elsif ($methylation_calls[$index] eq 'H') { | |
3141 $counting{total_meCHH_count}++; | 4429 $counting{total_meCHH_count}++; |
3142 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4430 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3143 } elsif ($methylation_calls[$index] eq 'h') { | 4431 $mbias_1{CHH}->{$index+1}->{meth}++; |
4432 } | |
4433 elsif ($methylation_calls[$index] eq 'h') { | |
3144 $counting{total_unmethylated_CHH_count}++; | 4434 $counting{total_unmethylated_CHH_count}++; |
3145 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4436 $mbias_1{CHH}->{$index+1}->{un}++; | |
3146 } | 4437 } |
3147 elsif ($methylation_calls[$index] eq '.') {} | 4438 elsif ($methylation_calls[$index] eq '.') {} |
4439 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3148 else{ | 4440 else{ |
3149 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4441 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
3150 } | 4442 } |
3151 } | 4443 } |
3152 } | 4444 } |
3162 $pos_offset += $pos_mod; | 4454 $pos_offset += $pos_mod; |
3163 } | 4455 } |
3164 | 4456 |
3165 if ($methylation_calls[$index] eq 'X') { | 4457 if ($methylation_calls[$index] eq 'X') { |
3166 $counting{total_meCHG_count}++; | 4458 $counting{total_meCHG_count}++; |
3167 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4459 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3168 } elsif ($methylation_calls[$index] eq 'x') { | 4460 $mbias_1{CHG}->{$index+1}->{meth}++; |
4461 } | |
4462 elsif ($methylation_calls[$index] eq 'x') { | |
3169 $counting{total_unmethylated_CHG_count}++; | 4463 $counting{total_unmethylated_CHG_count}++; |
3170 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4464 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3171 } elsif ($methylation_calls[$index] eq 'Z') { | 4465 $mbias_1{CHG}->{$index+1}->{un}++; |
4466 } | |
4467 elsif ($methylation_calls[$index] eq 'Z') { | |
3172 $counting{total_meCpG_count}++; | 4468 $counting{total_meCpG_count}++; |
3173 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4469 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3174 } elsif ($methylation_calls[$index] eq 'z') { | 4470 $mbias_1{CpG}->{$index+1}->{meth}++; |
4471 } | |
4472 elsif ($methylation_calls[$index] eq 'z') { | |
3175 $counting{total_unmethylated_CpG_count}++; | 4473 $counting{total_unmethylated_CpG_count}++; |
3176 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3177 } elsif ($methylation_calls[$index] eq 'H') { | 4475 $mbias_1{CpG}->{$index+1}->{un}++; |
4476 } | |
4477 elsif ($methylation_calls[$index] eq 'H') { | |
3178 $counting{total_meCHH_count}++; | 4478 $counting{total_meCHH_count}++; |
3179 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4479 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
3180 } elsif ($methylation_calls[$index] eq 'h') { | 4480 $mbias_1{CHH}->{$index+1}->{meth}++; |
4481 } | |
4482 elsif ($methylation_calls[$index] eq 'h') { | |
3181 $counting{total_unmethylated_CHH_count}++; | 4483 $counting{total_unmethylated_CHH_count}++; |
3182 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; | 4484 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); |
4485 $mbias_1{CHH}->{$index+1}->{un}++; | |
3183 } | 4486 } |
3184 elsif ($methylation_calls[$index] eq '.') {} | 4487 elsif ($methylation_calls[$index] eq '.') {} |
4488 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3185 else{ | 4489 else{ |
3186 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | 4490 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; |
3187 } | 4491 } |
3188 } | 4492 } |
3189 } | 4493 } |
3190 else { | 4494 else { |
3191 die "The strand information was neither + nor -: $strand\n"; | 4495 die "The strand information was neither + nor -: $strand\n"; |
3192 } | 4496 } |
3193 } | 4497 } |
3194 } | 4498 } |
3195 | |
3196 | |
3197 | |
3198 ####################################################################################################################################### | |
3199 ### bismark2bedGraph section - START | |
3200 ####################################################################################################################################### | |
3201 | |
3202 ### has now been moved to the external script bismark2bedGraph | |
3203 | |
3204 # sub process_bedGraph_output{ | |
3205 # warn "="x64,"\n"; | |
3206 # warn "Methylation information will now be written into a bedGraph file\n"; | |
3207 # warn "="x64,"\n\n"; | |
3208 # sleep (2); | |
3209 | |
3210 # ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | |
3211 # foreach my $fh (keys %fhs) { | |
3212 # if ($fh =~ /^[1230]$/) { | |
3213 # foreach my $context (keys %{$fhs{$fh}}) { | |
3214 # close $fhs{$fh}->{$context} or die $!; | |
3215 # } | |
3216 # } else { | |
3217 # close $fhs{$fh} or die $!; | |
3218 # } | |
3219 # } | |
3220 | |
3221 # ### deciding which files to use for bedGraph conversion | |
3222 # foreach my $filename (@sorting_files){ | |
3223 # # warn "$filename\n"; | |
3224 # if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again | |
3225 # $filename =~ s/.*\///; # replacing everything up to the last slash in the filename | |
3226 # # warn "$filename\n"; | |
3227 # } | |
3228 | |
3229 # if ($CX_context){ | |
3230 # push @bedfiles,$filename; | |
3231 # } | |
3232 # else{ ## CpG context only (default) | |
3233 # if ($filename =~ /^CpG_/){ | |
3234 # push @bedfiles,$filename; | |
3235 # } | |
3236 # else{ | |
3237 # # skipping CHH or CHG files | |
3238 # } | |
3239 # } | |
3240 # } | |
3241 | |
3242 # warn "Using the following files as Input:\n"; | |
3243 # print join ("\t",@bedfiles),"\n\n"; | |
3244 # sleep (2); | |
3245 | |
3246 # my %temp_fhs; | |
3247 # my @temp_files; # writing all context files (default CpG only) to these files prior to sorting | |
3248 | |
3249 # ### changing to the output directory | |
3250 # unless ($output_dir eq ''){ # default | |
3251 # chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
3252 # warn "Changed directory to $output_dir\n"; | |
3253 # } | |
3254 | |
3255 # foreach my $infile (@bedfiles) { | |
3256 | |
3257 # if ($remove) { | |
3258 # warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; | |
3259 | |
3260 # if ($infile =~ /gz$/){ | |
3261 # open (READ,"zcat $infile |") or die $!; | |
3262 # } | |
3263 # else{ | |
3264 # open (READ,$infile) or die $!; | |
3265 # } | |
3266 | |
3267 # my $removed_spaces_outfile = $infile; | |
3268 # $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; | |
3269 | |
3270 # open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; | |
3271 | |
3272 # unless ($no_header){ | |
3273 # $_ = <READ>; ### Bismark version header | |
3274 # print REM $_; ### Bismark version header | |
3275 # } | |
3276 | |
3277 # while (<READ>) { | |
3278 # chomp; | |
3279 # my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); | |
3280 # $id =~ s/\s+/_/g; | |
3281 # print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; | |
3282 # } | |
3283 | |
3284 # close READ or die $!; | |
3285 # close REM or die $!; | |
3286 | |
3287 # ### changing the infile name to the new file without spaces | |
3288 # $infile = $removed_spaces_outfile; | |
3289 # } | |
3290 | |
3291 # warn "Now writing methylation information for file $infile to individual files for each chromosome\n"; | |
3292 # if ($infile =~ /gz$/){ | |
3293 # open (IN,"zcat $infile |") or die $!; | |
3294 # } | |
3295 # else{ | |
3296 # open (IN,$infile) or die $!; | |
3297 # } | |
3298 | |
3299 # ## always ignoring the version header | |
3300 # unless ($no_header){ | |
3301 # $_ = <IN>; ### Bismark version header | |
3302 # } | |
3303 | |
3304 # while (<IN>) { | |
3305 # chomp; | |
3306 # my ($chr) = (split (/\t/))[2]; | |
3307 # # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n"; | |
3308 # $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names | |
3309 # # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n"; | |
3310 | |
3311 # unless (exists $temp_fhs{$chr}) { | |
3312 # open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!"; | |
3313 # } | |
3314 # print {$temp_fhs{$chr}} "$_\n"; | |
3315 # } | |
3316 | |
3317 # warn "Finished writing out individual chromosome files for $infile\n"; | |
3318 # } | |
3319 # warn "\n"; | |
3320 | |
3321 # @temp_files = <*.meth_extractor.temp>; | |
3322 | |
3323 # warn "Collecting temporary chromosome file information...\n"; | |
3324 # sleep (1); | |
3325 # warn "processing the following input file(s):\n"; | |
3326 # warn join ("\n",@temp_files),"\n\n"; | |
3327 # sleep (1); | |
3328 | |
3329 # foreach my $in (@temp_files) { | |
3330 # if ($sort_size){ | |
3331 # warn "Sorting input file $in by positions (using -S of '$sort_size')\n"; | |
3332 # } | |
3333 # else{ | |
3334 # warn "Sorting input file $in by positions (using default memory settings)\n"; | |
3335 # } | |
3336 # my $sort_dir = $output_dir; | |
3337 # if ($sort_dir eq ''){ | |
3338 # $sort_dir = './'; | |
3339 # } | |
3340 # open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!"; | |
3341 # # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; | |
3342 | |
3343 # ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data { | |
3344 | |
3345 # my $name; | |
3346 # my $meth_state; | |
3347 # my $chr = ""; | |
3348 # my $pos = 0; | |
3349 # my $meth_state2; | |
3350 | |
3351 # my $last_pos; | |
3352 # my $last_chr; | |
3353 | |
3354 # ############################################# } | |
3355 | |
3356 # while (my $line = <$ifh>) { | |
3357 # next if $line =~ /^Bismark/; | |
3358 # chomp $line; | |
3359 | |
3360 # ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement { | |
3361 | |
3362 # $last_chr = $chr; | |
3363 # $last_pos = $pos; | |
3364 # ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; | |
3365 | |
3366 # if (($last_pos ne $pos) || ($last_chr ne $chr)) { | |
3367 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
3368 # @methylcalls = qw (0 0 0); | |
3369 # } | |
3370 | |
3371 # ############################################# } | |
3372 | |
3373 # my $validated = validate_methylation_call($meth_state, $meth_state2); | |
3374 # unless($validated){ | |
3375 # warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; | |
3376 # next; | |
3377 # } | |
3378 # if ($meth_state eq "+") { | |
3379 # $methylcalls[0]++; | |
3380 # $methylcalls[2]++; | |
3381 # } else { | |
3382 # $methylcalls[1]++; | |
3383 # $methylcalls[2]++; | |
3384 # } | |
3385 # } | |
3386 | |
3387 # ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) { | |
3388 | |
3389 # $last_chr = $chr; | |
3390 # $last_pos = $pos; | |
3391 # if ($methylcalls[2] > 0) { | |
3392 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; | |
3393 # } | |
3394 # ############################################# } | |
3395 | |
3396 # close $ifh or die $!; | |
3397 | |
3398 # @methylcalls = qw (0 0 0); # resetting @methylcalls | |
3399 | |
3400 # ### deleting temporary files | |
3401 # my $delete = unlink $in; | |
3402 # if ($delete) { | |
3403 # warn "Successfully deleted the temporary input file $in\n\n"; | |
3404 # } | |
3405 # else { | |
3406 # warn "The temporary inputfile $in could not be deleted $!\n\n"; | |
3407 # } | |
3408 # } | |
3409 # } | |
3410 | |
3411 # sub generate_output{ | |
3412 # my $methcount = $methylcalls[0]; | |
3413 # my $nonmethcount = $methylcalls[1]; | |
3414 # my $totalcount = $methylcalls[2]; | |
3415 # my $last_chr = shift; | |
3416 # my $last_pos = shift; | |
3417 # croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0; | |
3418 # croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount); | |
3419 | |
3420 # ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation { | |
3421 | |
3422 # my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. | |
3423 # my $meth_percentage; | |
3424 # ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); | |
3425 # # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; | |
3426 # if (defined $meth_percentage){ | |
3427 # if ($counts){ | |
3428 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; | |
3429 # } | |
3430 # else{ | |
3431 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n"; | |
3432 # } | |
3433 # } | |
3434 # ############################################# } | |
3435 # } | |
3436 | |
3437 # sub validate_methylation_call{ | |
3438 # my $meth_state = shift; | |
3439 # croak "Missing (+/-) methylation call" unless defined $meth_state; | |
3440 # my $meth_state2 = shift; | |
3441 # croak "Missing alphabetical methylation call" unless defined $meth_state2; | |
3442 # my $is_consistent; | |
3443 # ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) | |
3444 # : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); | |
3445 # return 1 if $is_consistent; | |
3446 # return 0; | |
3447 # } | |
3448 | |
3449 # sub check_CpG_methylation_call{ | |
3450 # my $meth1 = shift; | |
3451 # my $meth2 = shift; | |
3452 # return 1 if($meth1 eq "+" && $meth2 eq "Z"); | |
3453 # return 1 if($meth1 eq "-" && $meth2 eq "z"); | |
3454 # return 0; | |
3455 # } | |
3456 | |
3457 # sub check_nonCpG_methylation_call{ | |
3458 # my $meth1 = shift; | |
3459 # my $meth2 = shift; | |
3460 # return 1 if($meth1 eq "+" && $meth2 eq "C"); | |
3461 # return 1 if($meth1 eq "+" && $meth2 eq "X"); | |
3462 # return 1 if($meth1 eq "+" && $meth2 eq "H"); | |
3463 # return 1 if($meth1 eq "-" && $meth2 eq "c"); | |
3464 # return 1 if($meth1 eq "-" && $meth2 eq "x"); | |
3465 # return 1 if($meth1 eq "-" && $meth2 eq "h"); | |
3466 # return 0; | |
3467 # } | |
3468 | |
3469 ####################################################################################################################################### | |
3470 ### bismark2bedGraph section - END | |
3471 ####################################################################################################################################### | |
3472 | |
3473 | |
3474 | |
3475 | |
3476 | |
3477 | |
3478 # ####################################################################################################################################### | |
3479 # ### genome-wide cytosine methylation report - START | |
3480 # ####################################################################################################################################### | |
3481 | |
3482 ### has now been moved to the external script bedGraph2cytosine | |
3483 | |
3484 # sub generate_genome_wide_cytosine_report { | |
3485 | |
3486 # warn "="x78,"\n"; | |
3487 # warn "Methylation information will now be written into a genome-wide cytosine report\n"; | |
3488 # warn "="x78,"\n\n"; | |
3489 # sleep (2); | |
3490 | |
3491 # ### changing to the output directory again | |
3492 # unless ($output_dir eq ''){ # default | |
3493 # chdir $output_dir or die "Failed to change directory to $output_dir\n"; | |
3494 # # warn "Changed directory to $output_dir\n"; | |
3495 # } | |
3496 | |
3497 # my $in = shift; | |
3498 # open (IN,$in) or die $!; | |
3499 | |
3500 # my $cytosine_out = shift; | |
3501 | |
3502 # if ($CX_context){ | |
3503 # $cytosine_out =~ s/$/genome-wide_CX_report.txt/; | |
3504 # } | |
3505 # else{ | |
3506 # $cytosine_out =~ s/$/genome_wide_CpG_report.txt/; | |
3507 # } | |
3508 | |
3509 # ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands | |
3510 # unless ($split_by_chromosome){ ### writing all output to a single file (default) | |
3511 # open (CYT,'>',$cytosine_out) or die $!; | |
3512 # warn "Writing genome-wide cytosine report to: $cytosine_out\n"; | |
3513 # sleep (3); | |
3514 # } | |
3515 | |
3516 # my $last_chr; | |
3517 # my %chr; # storing reads for one chromosome at a time | |
3518 | |
3519 # my $count = 0; | |
3520 # while (<IN>){ | |
3521 # chomp; | |
3522 # ++$count; | |
3523 # my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); | |
3524 | |
3525 # # defining the first chromosome | |
3526 # unless (defined $last_chr){ | |
3527 # $last_chr = $chr; | |
3528 # # warn "Storing all covered cytosine positions for chromosome: $chr\n"; | |
3529 # } | |
3530 | |
3531 # if ($chr eq $last_chr){ | |
3532 # $chr{$chr}->{$start}->{meth} = $meth; | |
3533 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3534 # } | |
3535 # else{ | |
3536 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3537 | |
3538 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3539 # my $chromosome_out = $cytosine_out; | |
3540 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3541 # open (CYT,'>',$chromosome_out) or die $!; | |
3542 # } | |
3543 | |
3544 # while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3545 | |
3546 # my $tri_nt = ''; | |
3547 # my $context = ''; | |
3548 # my $pos = pos$chromosomes{$last_chr}; | |
3549 | |
3550 # my $strand; | |
3551 # my $meth = 0; | |
3552 # my $nonmeth = 0; | |
3553 | |
3554 # if ($1 eq 'C'){ # C on forward strand | |
3555 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3556 # $strand = '+'; | |
3557 # } | |
3558 # elsif ($1 eq 'G'){ # C on reverse strand | |
3559 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3560 # $tri_nt = reverse $tri_nt; | |
3561 # $tri_nt =~ tr/ACTG/TGAC/; | |
3562 # $strand = '-'; | |
3563 # } | |
3564 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3565 | |
3566 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3567 # $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3568 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3569 # } | |
3570 | |
3571 # ### determining cytosine context | |
3572 # if ($tri_nt =~ /^CG/){ | |
3573 # $context = 'CG'; | |
3574 # } | |
3575 # elsif ($tri_nt =~ /^C.{1}G$/){ | |
3576 # $context = 'CHG'; | |
3577 # } | |
3578 # elsif ($tri_nt =~ /^C.{2}$/){ | |
3579 # $context = 'CHH'; | |
3580 # } | |
3581 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3582 # warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3583 # next; | |
3584 # } | |
3585 | |
3586 # if ($CpG_only){ | |
3587 # if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3588 # if ($zero){ # zero based coordinates | |
3589 # $pos -= 1; | |
3590 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3591 # } | |
3592 # else{ # default | |
3593 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3594 # } | |
3595 # } | |
3596 # } | |
3597 # else{ ## all cytosines, specified with --CX | |
3598 # if ($zero){ # zero based coordinates | |
3599 # $pos -= 1; | |
3600 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3601 # } | |
3602 # else{ # default | |
3603 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3604 # } | |
3605 # } | |
3606 # } | |
3607 | |
3608 # %chr = (); # resetting the hash | |
3609 | |
3610 # # new first entry | |
3611 # $last_chr = $chr; | |
3612 # $chr{$chr}->{$start}->{meth} = $meth; | |
3613 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth; | |
3614 # } | |
3615 # } | |
3616 | |
3617 # # Last found chromosome | |
3618 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; | |
3619 | |
3620 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome | |
3621 # my $chromosome_out = $cytosine_out; | |
3622 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/; | |
3623 # open (CYT,'>',$chromosome_out) or die $!; | |
3624 # } | |
3625 | |
3626 # while ( $chromosomes{$last_chr} =~ /([CG])/g){ | |
3627 | |
3628 # my $tri_nt; | |
3629 # my $context; | |
3630 # my $pos = pos$chromosomes{$last_chr}; | |
3631 | |
3632 # my $strand; | |
3633 # my $meth = 0; | |
3634 # my $nonmeth = 0; | |
3635 | |
3636 # if ($1 eq 'C'){ # C on forward strand | |
3637 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! | |
3638 # $strand = '+'; | |
3639 # } | |
3640 # elsif ($1 eq 'G'){ # C on reverse strand | |
3641 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! | |
3642 # $tri_nt = reverse $tri_nt; | |
3643 # $tri_nt =~ tr/ACTG/TGAC/; | |
3644 # $strand = '-'; | |
3645 # } | |
3646 | |
3647 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based! | |
3648 # $meth = $chr{$last_chr}->{$pos-1}->{meth}; | |
3649 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth}; | |
3650 # } | |
3651 | |
3652 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted | |
3653 | |
3654 # ### determining cytosine context | |
3655 # if ($tri_nt =~ /^CG/){ | |
3656 # $context = 'CG'; | |
3657 # } | |
3658 # elsif ($tri_nt =~ /^C.{1}G$/){ | |
3659 # $context = 'CHG'; | |
3660 # } | |
3661 # elsif ($tri_nt =~ /^C.{2}$/){ | |
3662 # $context = 'CHH'; | |
3663 # } | |
3664 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) | |
3665 # warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; | |
3666 # next; | |
3667 # } | |
3668 | |
3669 # if ($CpG_only){ | |
3670 # if ($tri_nt =~ /^CG/){ # CpG context is the default | |
3671 # if ($zero){ # zero-based coordinates | |
3672 # $pos -= 1; | |
3673 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3674 # } | |
3675 # else{ # default | |
3676 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3677 # } | |
3678 # } | |
3679 # } | |
3680 # else{ ## all cytosines, specified with --CX | |
3681 # if ($zero){ # zero based coordinates | |
3682 # $pos -= 1; | |
3683 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3684 # } | |
3685 # else{ # default | |
3686 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; | |
3687 # } | |
3688 # } | |
3689 # } | |
3690 # close CYT or die $!; | |
3691 # } | |
3692 | |
3693 | |
3694 # sub read_genome_into_memory{ | |
3695 | |
3696 # ## reading in and storing the specified genome in the %chromosomes hash | |
3697 # chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; | |
3698 # warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; | |
3699 | |
3700 # my @chromosome_filenames = <*.fa>; | |
3701 | |
3702 # ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta | |
3703 # unless (@chromosome_filenames){ | |
3704 # @chromosome_filenames = <*.fasta>; | |
3705 # } | |
3706 # unless (@chromosome_filenames){ | |
3707 # die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; | |
3708 # } | |
3709 | |
3710 # foreach my $chromosome_filename (@chromosome_filenames){ | |
3711 | |
3712 # # skipping the tophat entire mouse genome fasta file | |
3713 # next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); | |
3714 | |
3715 # open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; | |
3716 # ### first line needs to be a fastA header | |
3717 # my $first_line = <CHR_IN>; | |
3718 # chomp $first_line; | |
3719 # $first_line =~ s/\r//; # removing /r carriage returns | |
3720 | |
3721 # ### Extracting chromosome name from the FastA header | |
3722 # my $chromosome_name = extract_chromosome_name($first_line); | |
3723 | |
3724 # my $sequence; | |
3725 # while (<CHR_IN>){ | |
3726 # chomp; | |
3727 # $_ =~ s/\r//; # removing /r carriage returns | |
3728 | |
3729 # if ($_ =~ /^>/){ | |
3730 # ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) | |
3731 # if (exists $chromosomes{$chromosome_name}){ | |
3732 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3733 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; | |
3734 # } | |
3735 # else { | |
3736 # if (length($sequence) == 0){ | |
3737 # warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; | |
3738 # } | |
3739 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3740 # $chromosomes{$chromosome_name} = $sequence; | |
3741 # } | |
3742 # ### resetting the sequence variable | |
3743 # $sequence = ''; | |
3744 # ### setting new chromosome name | |
3745 # $chromosome_name = extract_chromosome_name($_); | |
3746 # } | |
3747 # else{ | |
3748 # $sequence .= uc$_; | |
3749 # } | |
3750 # } | |
3751 | |
3752 # if (exists $chromosomes{$chromosome_name}){ | |
3753 # warn "chr $chromosome_name (",length $sequence ," bp)\t"; | |
3754 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; | |
3755 # } | |
3756 # else{ | |
3757 # if (length($sequence) == 0){ | |
3758 # warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; | |
3759 # } | |
3760 # warn "chr $chromosome_name (",length $sequence ," bp)\n"; | |
3761 # $chromosomes{$chromosome_name} = $sequence; | |
3762 # } | |
3763 # } | |
3764 # warn "\n"; | |
3765 # chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; | |
3766 # } | |
3767 | |
3768 # sub extract_chromosome_name { | |
3769 # ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well | |
3770 # my $fasta_header = shift; | |
3771 # if ($fasta_header =~ s/^>//){ | |
3772 # my ($chromosome_name) = split (/\s+/,$fasta_header); | |
3773 # return $chromosome_name; | |
3774 # } | |
3775 # else{ | |
3776 # die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; | |
3777 # } | |
3778 # } | |
3779 | |
3780 # ####################################################################################################################################### | |
3781 # ### genome-wide cytosine methylation report - END | |
3782 # ####################################################################################################################################### | |
3783 | |
3784 | 4499 |
3785 | 4500 |
3786 | 4501 |
3787 sub print_helpfile{ | 4502 sub print_helpfile{ |
3788 | 4503 |
3795 methylation extractor. The script reads in a bisulfite read alignment results file | 4510 methylation extractor. The script reads in a bisulfite read alignment results file |
3796 produced by the Bismark bisulfite mapper and extracts the methylation information | 4511 produced by the Bismark bisulfite mapper and extracts the methylation information |
3797 for individual cytosines. This information is found in the methylation call field | 4512 for individual cytosines. This information is found in the methylation call field |
3798 which can contain the following characters: | 4513 which can contain the following characters: |
3799 | 4514 |
3800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 4515 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
3801 ~~~ X for methylated C in CHG context (was protected) ~~~ | 4516 ~~~ X for methylated C in CHG context ~~~ |
3802 ~~~ x for not methylated C CHG (was converted) ~~~ | 4517 ~~~ x for not methylated C CHG ~~~ |
3803 ~~~ H for methylated C in CHH context (was protected) ~~~ | 4518 ~~~ H for methylated C in CHH context ~~~ |
3804 ~~~ h for not methylated C in CHH context (was converted) ~~~ | 4519 ~~~ h for not methylated C in CHH context ~~~ |
3805 ~~~ Z for methylated C in CpG context (was protected) ~~~ | 4520 ~~~ Z for methylated C in CpG context ~~~ |
3806 ~~~ z for not methylated C in CpG context (was converted) ~~~ | 4521 ~~~ z for not methylated C in CpG context ~~~ |
3807 ~~~ . for any bases not involving cytosines ~~~ | 4522 ~~~ U for methylated C in Unknown context (CN or CHN) ~~~ |
3808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 4523 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ |
4524 ~~~ . for any bases not involving cytosines ~~~ | |
4525 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
3809 | 4526 |
3810 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | 4527 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH |
3811 context (this distinction is actually already made in Bismark itself). As the methylation | 4528 context (this distinction is actually already made in Bismark itself). As the methylation |
3812 information for every C analysed can produce files which easily have tens or even hundreds of | 4529 information for every C analysed can produce files which easily have tens or even hundreds of |
3813 millions of lines, file sizes can become very large and more difficult to handle. The C | 4530 millions of lines, file sizes can become very large and more difficult to handle. The C |
3837 | 4554 |
3838 USAGE: methylation_extractor [options] <filenames> | 4555 USAGE: methylation_extractor [options] <filenames> |
3839 | 4556 |
3840 | 4557 |
3841 ARGUMENTS: | 4558 ARGUMENTS: |
4559 ========== | |
3842 | 4560 |
3843 <filenames> A space-separated list of Bismark result files in SAM format from | 4561 <filenames> A space-separated list of Bismark result files in SAM format from |
3844 which methylation information is extracted for every cytosine in | 4562 which methylation information is extracted for every cytosine in |
3845 the reads. For alignment files in the older custom Bismark output | 4563 the reads. For alignment files in the older custom Bismark output |
3846 see option '--vanilla'. | 4564 see option '--vanilla'. |
3866 Whilst this option removes a bias towards more methylation calls | 4584 Whilst this option removes a bias towards more methylation calls |
3867 in the center of sequenced fragments it may de facto remove a sizable | 4585 in the center of sequenced fragments it may de facto remove a sizable |
3868 proportion of the data. This option is highly recommended for paired-end | 4586 proportion of the data. This option is highly recommended for paired-end |
3869 data. | 4587 data. |
3870 | 4588 |
3871 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the | 4589 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 when processing the |
3872 methylation call string. This can remove e.g. a restriction enzyme site | 4590 methylation call string. This can remove e.g. a restriction enzyme site |
3873 at the start of each read. | 4591 at the start of each read or any other source of bias (e.g. PBAT-Seq data). |
3874 | 4592 |
3875 --comprehensive Specifying this option will merge all four possible strand-specific | 4593 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing |
3876 methylation info into context-dependent output files. The default | 4594 results only. Since the first couple of bases in Read 2 of BS-Seq experiments |
4595 show a severe bias towards non-methylation as a result of end-repairing | |
4596 sonicated fragments with unmethylated cytosines (see M-bias plot), it is | |
4597 recommended that the first couple of bp of Read 2 are removed before | |
4598 starting downstream analysis. Please see the section on M-bias plots in the | |
4599 Bismark User Guide for more details. | |
4600 | |
4601 --comprehensive Specifying this option will merge all four possible strand-specific | |
4602 methylation info into context-dependent output files. The default | |
4603 | |
3877 contexts are: | 4604 contexts are: |
3878 - CpG context | 4605 - CpG context |
3879 - CHG context | 4606 - CHG context |
3880 - CHH context | 4607 - CHH context |
3881 | 4608 |
3902 | 4629 |
3903 --version Displays version information. | 4630 --version Displays version information. |
3904 | 4631 |
3905 -h/--help Displays this help file and exits. | 4632 -h/--help Displays this help file and exits. |
3906 | 4633 |
4634 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as | |
4635 well as a report (optional) and then quit. Default: OFF. | |
4636 | |
3907 | 4637 |
3908 | 4638 |
3909 bedGraph specific options: | 4639 bedGraph specific options: |
4640 ========================== | |
3910 | 4641 |
3911 --bedGraph After finishing the methylation extraction, the methylation output is written into a | 4642 --bedGraph After finishing the methylation extraction, the methylation output is written into a |
3912 sorted bedGraph file that reports the position of a given cytosine and its methylation | 4643 sorted bedGraph file that reports the position of a given cytosine and its methylation |
3913 state (in %, see details below). The methylation extractor output is temporarily split up into | 4644 state (in %, see details below). The methylation extractor output is temporarily split up into |
3914 temporary files, one per chromosome (written into the current directory or folder | 4645 temporary files, one per chromosome (written into the current directory or folder |
3925 before its methylation percentage is reported. Default: 1. | 4656 before its methylation percentage is reported. Default: 1. |
3926 | 4657 |
3927 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | 4658 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. |
3928 | 4659 |
3929 | 4660 |
3930 --counts Adds two additional columns to the output file to enable further calculations: | |
3931 col 5: number of methylated calls | |
3932 col 6: number of unmethylated calls | |
3933 This option is required if '--cytosine_report' is specified (and will be set automatically if | |
3934 necessary). | |
3935 | |
3936 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | 4661 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered |
3937 in the experiment irrespective of its sequence context. This applies to both forward and | 4662 in the experiment irrespective of its sequence context. This applies to both forward and |
3938 reverse strands. Please be aware that this option may generate large temporary and output files | 4663 reverse strands. Please be aware that this option may generate large temporary and output files |
3939 and may take a long time to sort (up to many hours). Default: OFF. | 4664 and may take a long time to sort (up to many hours). Default: OFF. |
3940 (i.e. Default = CpG context only). | 4665 (i.e. Default = CpG context only). |
3943 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or | 4668 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or |
3944 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. | 4669 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. |
3945 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. | 4670 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. |
3946 Defaults to 2G. | 4671 Defaults to 2G. |
3947 | 4672 |
4673 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of | |
4674 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to | |
4675 individual chromosome files. These errors were caused by the operating system's limit | |
4676 of the number of filehandles that can be written to at any one time (typically 1024; to | |
4677 find out this limit on Linux, type: ulimit -a). | |
4678 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort | |
4679 methylation calls into individual chromosome files. Instead, all input files are | |
4680 temporarily merged into a single file (unless there is only a single file), and this | |
4681 file will then be sorted by both chromosome AND position using the Unix sort command. | |
4682 Please be aware that this option might take a looooong time to complete, depending on | |
4683 the size of the input files, and the memory you allocate to this process (see --buffer_size). | |
4684 Nevertheless, it seems to be working. | |
4685 | |
4686 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will | |
4687 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster | |
4688 sorting process of very large files, but this comes at the cost of a larger memory footprint | |
4689 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB | |
4690 of RAM). Due to overheads in creating and looping through these arrays it seems that it will | |
4691 actually be *slower* for small files (few million alignments), and we are currently testing at | |
4692 which point it is advisable to use this option. Note that --ample_memory is not compatible | |
4693 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). | |
4694 | |
4695 | |
3948 | 4696 |
3949 Genome-wide cytosine methylation report specific options: | 4697 Genome-wide cytosine methylation report specific options: |
4698 ========================================================= | |
3950 | 4699 |
3951 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a | 4700 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a |
3952 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based | 4701 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based |
3953 chromosome coordinates (zero-based coords are optional) and reports CpG context only (all | 4702 chromosome coordinates (zero-based coords are optional) and reports CpG context only (all |
3954 cytosine context is optional). The output considers all Cs on both forward and reverse strands and | 4703 cytosine context is optional). The output considers all Cs on both forward and reverse strands and |
3981 * Methylated cytosines receive a '+' orientation, | 4730 * Methylated cytosines receive a '+' orientation, |
3982 * Unmethylated cytosines receive a '-' orientation. | 4731 * Unmethylated cytosines receive a '-' orientation. |
3983 | 4732 |
3984 | 4733 |
3985 | 4734 |
3986 The bedGraph output (optional) looks like this (tab-delimited): | 4735 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): |
3987 =============================================================== | 4736 ========================================================================================================= |
4737 | |
4738 track type=bedGraph (header line) | |
4739 | |
3988 <chromosome> <start position> <end position> <methylation percentage> | 4740 <chromosome> <start position> <end position> <methylation percentage> |
3989 | 4741 |
3990 The bedGraph output with '--counts' specified looks like this (tab-delimited): | 4742 |
4743 | |
4744 The coverage output looks like this (tab-delimited, 1-based genomic coords): | |
4745 ============================================================================ | |
3991 | 4746 |
3992 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> | 4747 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> |
3993 | 4748 |
3994 | 4749 |
3995 | 4750 |
3997 ========================================================================================== | 4752 ========================================================================================== |
3998 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | 4753 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> |
3999 | 4754 |
4000 | 4755 |
4001 | 4756 |
4002 This script was last modified on 21 April 2013. | 4757 This script was last modified on 25 November 2013. |
4003 | 4758 |
4004 HOW_TO | 4759 HOW_TO |
4005 } | 4760 } |