comparison bismark_methylation_extractor @ 3:91f07ff056ca draft

Uploaded
author bgruening
date Mon, 14 Apr 2014 16:43:14 -0400
parents 62c6da72dd4a
children
comparison
equal deleted inserted replaced
2:82814a8a2395 3:91f07ff056ca
6 use Cwd; 6 use Cwd;
7 use Carp; 7 use Carp;
8 use FindBin qw($Bin); 8 use FindBin qw($Bin);
9 use lib "$Bin/../lib"; 9 use lib "$Bin/../lib";
10 10
11
11 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) 12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
12 13
13 ## This program is free software: you can redistribute it and/or modify 14 ## This program is free software: you can redistribute it and/or modify
14 ## it under the terms of the GNU General Public License as published by 15 ## it under the terms of the GNU General Public License as published by
15 ## the Free Software Foundation, either version 3 of the License, or 16 ## the Free Software Foundation, either version 3 of the License, or
27 my %counting; 28 my %counting;
28 my $parent_dir = getcwd(); 29 my $parent_dir = getcwd();
29 30
30 my %fhs; 31 my %fhs;
31 32
32 my $version = 'v0.7.11'; 33 my $version = 'v0.10.1';
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline(); 34 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline();
34 35
35 36
36 ### only needed for bedGraph output 37 ### only needed for bedGraph output
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files 38 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total 39 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
39 my @bedfiles; 40 my @bedfiles;
40 41
41 ### only needed for genome-wide cytosine methylation report 42 ### only needed for genome-wide cytosine methylation report
42 my %chromosomes; 43 my %chromosomes;
43 44
45 my %mbias_1;
46 my %mbias_2;
47
44 ############################################################################################## 48 ##############################################################################################
45 ### Summarising Run Parameters 49 ### Summarising Run Parameters
46 ############################################################################################## 50 ##############################################################################################
47 51
48 ### METHYLATION EXTRACTOR 52 ### METHYLATION EXTRACTOR
65 else{ 69 else{
66 warn "Bismark paired-end SAM format specified (default)\n"; # default 70 warn "Bismark paired-end SAM format specified (default)\n"; # default
67 } 71 }
68 } 72 }
69 73
70 if ($ignore){ 74 if ($single){
71 warn "First $ignore bases will be disregarded when processing the methylation call string\n"; 75 if ($ignore){
76 warn "First $ignore bp will be disregarded when processing the methylation call string\n";
77 }
72 } 78 }
79 else{ ## paired-end
80 if ($ignore){
81 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
82 }
83 if ($ignore_r2){
84 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
85 }
86 }
87
73 88
74 if ($full){ 89 if ($full){
75 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; 90 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
76 } 91 }
77 if ($merge_non_CpG){ 92 if ($merge_non_CpG){
93 if ($bedGraph){ 108 if ($bedGraph){
94 warn "\n\nSummarising bedGraph parameters:\n"; 109 warn "\n\nSummarising bedGraph parameters:\n";
95 warn '='x63,"\n"; 110 warn '='x63,"\n";
96 111
97 if ($counts){ 112 if ($counts){
98 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n"; 113 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
99 } 114 }
100 else{ 115 else{
101 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; 116 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
102 } 117 }
103 118
113 128
114 if ($remove){ 129 if ($remove){
115 warn "White spaces in read ID names will be removed prior to sorting\n"; 130 warn "White spaces in read ID names will be removed prior to sorting\n";
116 } 131 }
117 132
118 if (defined $sort_size){ 133 if ($ample_mem){
134 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
135 }
136 elsif (defined $sort_size){
119 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; 137 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
120 } 138 }
121 else{ 139 else{
122 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; 140 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
123 } 141 }
184 total_unmethylated_CHG_count => 0, 202 total_unmethylated_CHG_count => 0,
185 total_unmethylated_CHH_count => 0, 203 total_unmethylated_CHH_count => 0,
186 total_unmethylated_CpG_count => 0, 204 total_unmethylated_CpG_count => 0,
187 sequences_count => 0, 205 sequences_count => 0,
188 ); 206 );
207
189 @sorting_files = (); 208 @sorting_files = ();
190 @bedfiles = (); 209 @bedfiles = ();
210
211 %mbias_1 = ();
212 %mbias_2 = ();
213
214 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
215 unless ($vanilla){
216 if ($paired){
217 test_positional_sorting($filename);
218 }
219 }
191 220
192 process_Bismark_results_file($filename); 221 process_Bismark_results_file($filename);
193 222
194 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues 223 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
195 foreach my $fh (keys %fhs) { 224 foreach my $fh (keys %fhs) {
196 if ($fh =~ /^[1230]$/) { 225 if ($fh =~ /^[1230]$/) {
197 foreach my $context (keys %{$fhs{$fh}}) { 226 foreach my $context (keys %{$fhs{$fh}}) {
198 close $fhs{$fh}->{$context} or die $!; 227 close $fhs{$fh}->{$context} or die $!;
199 228 }
200 } 229 }
201 } else { 230 else{
202 close $fhs{$fh} or die $!; 231 close $fhs{$fh} or die $!;
203 } 232 }
204 } 233 }
234
235 ### printing out all M-Bias data
236 produce_mbias_plots ($filename);
237
238 delete_unused_files();
205 239
206 if ($bedGraph){ 240 if ($bedGraph){
207 241
208 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified 242 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
209 $out =~ s/gz$//; 243 $out =~ s/gz$//;
210 $out =~ s/sam$//; 244 $out =~ s/sam$//;
211 $out =~ s/bam$//; 245 $out =~ s/bam$//;
212 $out =~ s/txt$//; 246 $out =~ s/txt$//;
213 $out =~ s/$/bedGraph/; 247 $out =~ s/$/bedGraph/;
214 248
215
216
217 my $bedGraph_output = $out; 249 my $bedGraph_output = $out;
218 my @args; 250 my @args;
219 251
220 if ($remove){ 252 if ($remove){
221 push @args, '--remove'; 253 push @args, '--remove';
224 push @args, '--CX_context'; 256 push @args, '--CX_context';
225 } 257 }
226 if ($no_header){ 258 if ($no_header){
227 push @args, '--no_header'; 259 push @args, '--no_header';
228 } 260 }
229 if ($counts){ 261 if ($gazillion){
230 push @args, "--counts"; 262 push @args, '--gazillion';
231 } 263 }
264 if ($ample_mem){
265 push @args, '--ample_memory';
266 }
267
268
269 # if ($counts){
270 # push @args, "--counts";
271 # }
232 272
233 push @args, "--buffer_size $sort_size"; 273 push @args, "--buffer_size $sort_size";
234 push @args, "--cutoff $coverage_threshold"; 274 push @args, "--cutoff $coverage_threshold";
235 push @args, "--output $bedGraph_output"; 275 push @args, "--output $bedGraph_output";
236 push @args, "--dir '$output_dir'"; 276 push @args, "--dir '$output_dir'";
252 # process_bedGraph_output(); 292 # process_bedGraph_output();
253 # close OUT or die $!; 293 # close OUT or die $!;
254 294
255 ### genome-wide cytosine methylation report requires bedGraph processing anyway 295 ### genome-wide cytosine methylation report requires bedGraph processing anyway
256 if ($cytosine_report){ 296 if ($cytosine_report){
297
257 @args = (); # resetting @args 298 @args = (); # resetting @args
258 my $cytosine_out = $out; 299 my $cytosine_out = $out;
259 $cytosine_out =~ s/bedGraph$//; 300 $cytosine_out =~ s/bedGraph$//;
260 301
261 if ($CX_context){ 302 if ($CX_context){
278 } 319 }
279 if ($split_by_chromosome){ 320 if ($split_by_chromosome){
280 push @args, '--split_by_chromosome'; 321 push @args, '--split_by_chromosome';
281 } 322 }
282 323
283 push @args, $bedGraph_output; # this will be the infile 324 my $coverage_output = $bedGraph_output;
284 325 $coverage_output =~ s/bedGraph$/bismark.cov/;
285 system ("$Bin/bedGraph2cytosine @args"); 326
327 push @args, $output_dir . $coverage_output; # this will be the infile
328
329 system ("$Bin/coverage2cytosine @args");
286 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); 330 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
287 warn "\n\nFinished generating genome-wide cytosine report\n\n"; 331 warn "\n\nFinished generating genome-wide cytosine report\n\n";
288 } 332 }
289 } 333 }
290 } 334 }
291 335
### Removes methylation extractor output files that do not contain any data lines.
###
### Walks the file-level list @sorting_files. A file counts as 'used' as soon as it holds one
### line that does not start with 'Bismark' (presumably the version header line the extractor
### writes at the top of each output file - confirm against the code that opens these files).
### Files without such a line are unlinked and removed from @sorting_files, so that afterwards
### the list only names files that are worth processing further.
###
### Takes no arguments and returns nothing meaningful.
### Dies if one of the files cannot be opened for reading.
sub delete_unused_files{

  warn "Deleting unused files ...\n\n"; sleep(1);

  my $index = 0;

  ### $index is only advanced for files that are kept; a deleted entry is spliced out, which
  ### moves the next candidate into the current slot
  while ($index <= $#sorting_files){

    my $file = $sorting_files[$index];
    my $used_fh;

    if ($file =~ /gz$/){
      ### list form of the pipe open: the filename is handed to zcat as one single argument and
      ### is never parsed by a shell, so paths containing spaces or shell metacharacters work
      open ($used_fh,'-|','zcat',$file) or die "Failed to read from methylation extractor output file $file: $!\n";
    }
    else{
      open ($used_fh,'<',$file) or die "Failed to read from methylation extractor output file $file: $!\n";
    }

    my $used = 0;

    while (my $line = <$used_fh>){
      next if ($line =~ /^Bismark/);
      ### any line other than the header means the file contains data
      $used = 1;
      last;
    }

    ### The return value of close is ignored on purpose: leaving the read loop early may cause
    ### zcat to terminate with a non-zero status, which close() reports as a failure for piped handles
    close $used_fh;

    if ($used){
      warn "$file contains data ->\tkept\n";
      ++$index;
    }
    else{

      my $delete = unlink $file;

      if ($delete){
	warn "$file was empty ->\tdeleted\n";
      }
      else{
	warn "$file was empty, however deletion was unsuccessful: $!\n"
      }

      ### we also need to remove the element from @sorting_files
      splice @sorting_files, $index, 1;
    }
  }
  warn "\n\n";
}
381
### Writes the M-bias table for one input file and, if the GD::Graph modules can be loaded,
### the corresponding M-bias plot(s) in PNG format.
###
### Argument: the name (optionally including a path) of the Bismark result file; only its
###           basename, with a trailing gz/sam/bam/txt removed, is used to build output names.
###
### Reads the file-level variables %mbias_1, %mbias_2, $output_dir, $paired and $single.
### Output files (all written to $output_dir):
###   <basename>M-bias.txt     table of per-position methylation calls for CpG, CHG and CHH context
###   <basename>M-bias_R1.png  plot for Read 1 (only if GD::Graph is available)
###   <basename>M-bias_R2.png  plot for Read 2 (paired-end only, only if GD::Graph is available)
###
### Positions without any calls are filled in with 0 counts in %mbias_1/%mbias_2 as a side effect.
### Dies if an output file cannot be written or if GD::Graph reports an error.
sub produce_mbias_plots{

  my $filename = shift;

  my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $mbias =~ s/gz$//;
  $mbias =~ s/sam$//;
  $mbias =~ s/bam$//;
  $mbias =~ s/txt$//;

  my $mbias_graph_1 = $output_dir . $mbias . 'M-bias_R1.png';
  my $mbias_graph_2 = $output_dir . $mbias . 'M-bias_R2.png';
  my $mbias_table   = $output_dir . $mbias . 'M-bias.txt';

  open (my $mbias_fh,'>',$mbias_table) or die "Failed to open file for the M-bias data\n\n";

  # determining maximum read length
  my $max_length_1 = _mbias_max_position(\%mbias_1);
  my $max_length_2 = 0;
  if ($paired){
    $max_length_2 = _mbias_max_position(\%mbias_2);
  }

  if ($single){
    warn "Determining maximum read length for M-Bias plot\n";
    warn "Maximum read length of Read 1: $max_length_1\n\n";
  }
  else{
    warn "Determining maximum read lengths for M-Bias plots\n";
    warn "Maximum read length of Read 1: $max_length_1\n";
    warn "Maximum read length of Read 2: $max_length_2\n\n";
  }

  ### Plots are only drawn if both GD::Graph::lines and GD::Graph::colour can be loaded at run time.
  ### Each eval block ends in a true value so that success can be told apart from failure
  ### without relying on the global $@
  my $gd_graph_installed = 0;

  if (eval { require GD::Graph::lines; GD::Graph::lines->import(); 1 }){

    if (eval { require GD::Graph::colour; GD::Graph::colour->import(qw(:colours :lists :files :convert)); 1 }){
      $gd_graph_installed = 1;
    }
    else{
      warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
      sleep(2);
    }
  }
  else{
    warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
    sleep(2);
  }

  ### Read 1 (or the only read for single-end files)
  my ($heading_suffix,$underline,$graph_title);
  if ($paired){
    ($heading_suffix,$underline,$graph_title) = (' (R1)','================','M-bias (Read 1)');
  }
  else{
    ($heading_suffix,$underline,$graph_title) = ('','===========','M-bias');
  }

  my $mbias_read1 = _mbias_write_table($mbias_fh,\%mbias_1,$max_length_1,$heading_suffix,$underline);

  if ($gd_graph_installed){
    _mbias_draw_plot($mbias_read1,$mbias_graph_1,$max_length_1,$graph_title,'# methylation calls',1);
  }

  ### Read 2
  if ($paired){

    my $mbias_read2 = _mbias_write_table($mbias_fh,\%mbias_2,$max_length_2,' (R2)','================');

    if ($gd_graph_installed){
      ### the x-axis of the Read 2 plot is scaled to the length of Read 2 (it used to take the length of Read 1)
      _mbias_draw_plot($mbias_read2,$mbias_graph_2,$max_length_2,'M-bias (Read 2)','# calls',2);
    }
  }

  ### buffered write errors only surface when the filehandle is closed
  close $mbias_fh or die "Failed to close M-bias data file $mbias_table: $!\n";
}

### Returns the highest position found in any context of an M-bias hash
### ($mbias->{context}->{position}), or 0 if the hash holds no positions at all.
sub _mbias_max_position{

  my ($mbias) = @_;
  my $max_length = 0;

  foreach my $context (keys %{$mbias}){
    foreach my $pos (keys %{$mbias->{$context}}){
      $max_length = $pos unless ($max_length >= $pos);
    }
  }
  return $max_length;
}

### Prints the M-bias table of one read for the contexts CpG, CHG and CHH to $fh and returns a
### reference to the data structure that is later handed to GD::Graph:
###   [0]      positions 1..$max_length
###   [1]-[3]  % methylation per position for CpG, CHG and CHH ('' if a position has no calls)
###   [4]-[6]  total number of calls per position for CpG, CHG and CHH
### $heading_suffix and $underline make up the heading that is printed above each context.
sub _mbias_write_table{

  my ($fh,$mbias,$max_length,$heading_suffix,$underline) = @_;

  my %series_of = (CpG => 1, CHG => 2, CHH => 3);
  my @graph_data;

  foreach my $context (qw(CpG CHG CHH)){

    ### the positions are identical for all three contexts and are thus only kept once
    @{$graph_data[0]} = ();

    print {$fh} "$context context$heading_suffix\n$underline\n";
    print {$fh} "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";

    foreach my $pos (1..$max_length){

      ### positions that were never seen get explicit 0 counts
      my $calls = ($mbias->{$context}->{$pos} ||= {});
      $calls->{meth} = 0 unless (defined $calls->{meth});
      $calls->{un}   = 0 unless (defined $calls->{un});

      my $coverage = $calls->{un} + $calls->{meth};

      my $percent = '';
      if ($coverage > 0){
	$percent = sprintf("%.2f",$calls->{meth} * 100/ $coverage);
      }

      print {$fh} "$pos\t$calls->{meth}\t$calls->{un}\t$percent\t$coverage\n";

      push @{$graph_data[0]},$pos;
      push @{$graph_data[$series_of{$context}]},$percent;
      push @{$graph_data[$series_of{$context} + 3]},$coverage;
    }
    print {$fh} "\n";
  }

  return \@graph_data;
}

### Draws one M-bias plot (3 methylation lines on the left axis, 3 call count lines on the right
### axis) and writes it to $png_file. $plot_number (1 or 2) is only used in the error message.
### Must only be called once GD::Graph::lines and GD::Graph::colour have been loaded.
sub _mbias_draw_plot{

  my ($graph_data,$png_file,$max_length,$title,$y2_label,$plot_number) = @_;

  add_colour(nice_blue   => [31,120,180]);
  add_colour(nice_orange => [255,127,0]);
  add_colour(nice_green  => [51,160,44]);
  add_colour(pale_blue   => [153,206,227]);
  add_colour(pale_orange => [253,204,138]);
  add_colour(pale_green  => [191,230,207]);

  my $graph = GD::Graph::lines->new(800,600);

  $graph->set(
	      x_label              => 'position (bp)',
	      y1_label             => '% methylation',
	      y2_label             => $y2_label,
	      title                => $title,
	      line_width           => 2,
	      x_max_value          => $max_length,
	      x_min_value          => 0,
	      y_tick_number        => 10,
	      y_label_skip         => 2,
	      y1_max_value         => 100,
	      y1_min_value         => 0,
	      y2_min_value         => 0,
	      x_label_skip         => 5,
	      x_label_position     => 0.5,
	      x_tick_offset        => -1,
	      bgclr                => 'white',
	      transparent          => 0,
	      two_axes             => 1,
	      use_axis             => [1,1,1,2,2,2],
	      legend_placement     => 'RC',
	      legend_spacing       => 6,
	      legend_marker_width  => 24,
	      legend_marker_height => 18,
	      dclrs                => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
	     ) or die $graph->error;

  $graph->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');

  my $gd = $graph->plot($graph_data) or die $graph->error;

  open (my $png_fh,'>',$png_file) or die "Failed to write to file for M-bias plot $plot_number: $!\n\n";
  binmode $png_fh;
  print {$png_fh} $gd->png;
  close $png_fh or die "Failed to close file for M-bias plot $plot_number: $!\n\n";
}
292 654
293 sub process_commandline{ 655 sub process_commandline{
294 my $help; 656 my $help;
295 my $single_end; 657 my $single_end;
296 my $paired_end; 658 my $paired_end;
297 my $ignore; 659 my $ignore;
660 my $ignore_r2;
298 my $genomic_fasta; 661 my $genomic_fasta;
299 my $full; 662 my $full;
300 my $report; 663 my $report;
301 my $extractor_version; 664 my $extractor_version;
302 my $no_overlap; 665 my $no_overlap;
315 my $CX_context; 678 my $CX_context;
316 my $split_by_chromosome; 679 my $split_by_chromosome;
317 my $sort_size; 680 my $sort_size;
318 my $samtools_path; 681 my $samtools_path;
319 my $gzip; 682 my $gzip;
320 683 my $mbias_only;
321 my $command_line = GetOptions ('help|man' => \$help, 684 my $gazillion;
322 'p|paired-end' => \$paired_end, 685 my $ample_mem;
323 's|single-end' => \$single_end, 686
324 'fasta' => \$genomic_fasta, 687 my $command_line = GetOptions ('help|man' => \$help,
325 'ignore=i' => \$ignore, 688 'p|paired-end' => \$paired_end,
326 'comprehensive' => \$full, 689 's|single-end' => \$single_end,
327 'report' => \$report, 690 'fasta' => \$genomic_fasta,
328 'version' => \$extractor_version, 691 'ignore=i' => \$ignore,
329 'no_overlap' => \$no_overlap, 692 'ignore_r2=i' => \$ignore_r2,
330 'merge_non_CpG' => \$merge_non_CpG, 693 'comprehensive' => \$full,
331 'vanilla' => \$vanilla, 694 'report' => \$report,
332 'o|output=s' => \$output_dir, 695 'version' => \$extractor_version,
333 'no_header' => \$no_header, 696 'no_overlap' => \$no_overlap,
334 'bedGraph' => \$bedGraph, 697 'merge_non_CpG' => \$merge_non_CpG,
335 "cutoff=i" => \$coverage_threshold, 698 'vanilla' => \$vanilla,
336 "remove_spaces" => \$remove, 699 'o|output=s' => \$output_dir,
337 "counts" => \$counts, 700 'no_header' => \$no_header,
338 "cytosine_report" => \$cytosine_report, 701 'bedGraph' => \$bedGraph,
339 'g|genome_folder=s' => \$genome_folder, 702 "cutoff=i" => \$coverage_threshold,
340 "zero_based" => \$zero, 703 "remove_spaces" => \$remove,
341 "CX|CX_context" => \$CX_context, 704 "counts" => \$counts,
342 "split_by_chromosome" => \$split_by_chromosome, 705 "cytosine_report" => \$cytosine_report,
343 "buffer_size=s" => \$sort_size, 706 'g|genome_folder=s' => \$genome_folder,
344 'samtools_path=s' => \$samtools_path, 707 "zero_based" => \$zero,
345 "gzip" => \$gzip, 708 "CX|CX_context" => \$CX_context,
346 ); 709 "split_by_chromosome" => \$split_by_chromosome,
710 "buffer_size=s" => \$sort_size,
711 'samtools_path=s' => \$samtools_path,
712 "gzip" => \$gzip,
713 "mbias_only" => \$mbias_only,
714 "gazillion|scaffolds" => \$gazillion,
715 "ample_memory" => \$ample_mem,
716 );
347 717
348 ### EXIT ON ERROR if there were errors with any of the supplied options 718 ### EXIT ON ERROR if there were errors with any of the supplied options
349 unless ($command_line){ 719 unless ($command_line){
350 die "Please respecify command line options\n"; 720 die "Please respecify command line options\n";
351 } 721 }
378 } 748 }
379 @filenames = @ARGV; 749 @filenames = @ARGV;
380 750
381 warn "\n *** Bismark methylation extractor version $version ***\n\n"; 751 warn "\n *** Bismark methylation extractor version $version ***\n\n";
382 752
383 ### IGNORING <INT> bases at the start of the read when processing the methylation call string 753 ### M-BIAS ONLY
384 unless ($ignore){ 754 if ($mbias_only){
385 $ignore = 0; 755 if ($bedGraph){
386 } 756 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
757 }
758 if ($cytosine_report){
759 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
760 }
761 if ($merge_non_CpG){
762 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
763 }
764 if ($full){
765 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
766 }
767 sleep(3);
768 }
769
387 ### PRINT A REPORT 770 ### PRINT A REPORT
388 unless ($report){ 771 unless ($report){
389 $report = 0; 772 $report = 0;
390 } 773 }
391 774
414 } 797 }
415 elsif ($paired_end){ 798 elsif ($paired_end){
416 $single_end = 0; ### PAIRED-END ALIGNMENTS 799 $single_end = 0; ### PAIRED-END ALIGNMENTS
417 } 800 }
418 else{ 801 else{
419 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; 802
420 } 803 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
804
805 if ($vanilla){
806 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
807 }
808 else{ # SAM/BAM format
809
810 my $file = $filenames[0];
811 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
812
813 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
814 if ($file =~ /\.gz$/){
815 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n";
816 }
817 elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){
818 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n";
819 }
820 else{
821 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
822 }
823
824 while (<DETERMINE>){
825 last unless (/^\@/);
826 if ($_ =~ /^\@PG/){
827 # warn "found the \@PG line:\n";
828 # warn "$_";
829
830 if ($_ =~ /-1/ and $_ =~ /-2/){
831 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
832 $paired_end = 1;
833 $single_end = 0;
834 }
835 else{
836 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
837 $paired_end = 0;
838 $single_end = 1;
839 }
840 }
841 }
842
843 close DETERMINE or warn $!;
844
845 }
846 }
847
848 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
849 unless ($ignore){
850 $ignore = 0;
851 }
852
853 if (defined $ignore_r2){
854 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
855 }
856 else{
857 $ignore_r2 = 0;
858 }
859
421 860
422 ### NO OVERLAP 861 ### NO OVERLAP
423 if ($no_overlap){ 862 if ($no_overlap){
424 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); 863 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
425 } 864 }
472 else{ 911 else{
473 $CX_context = 0; 912 $CX_context = 0;
474 } 913 }
475 914
476 unless ($counts){ 915 unless ($counts){
477 $counts = 0; 916 $counts = 1; # counts will always be set
478 } 917 }
479 918
480 if ($cytosine_report){ 919 if ($cytosine_report){
481 920
482 ### GENOME folder 921 ### GENOME folder
492 unless ($bedGraph){ 931 unless ($bedGraph){
493 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; 932 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
494 $bedGraph = 1; 933 $bedGraph = 1;
495 } 934 }
496 unless ($counts){ 935 unless ($counts){
497 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; 936 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
498 $counts = 1; 937 $counts = 1;
499 } 938 }
500 warn "\n"; 939 warn "\n";
501 } 940 }
502 941
534 973
535 unless (defined $samtools_path){ 974 unless (defined $samtools_path){
536 $samtools_path = ''; 975 $samtools_path = '';
537 } 976 }
538 977
539 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip); 978
979 if ($gazillion){
980 if ($ample_mem){
981 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
982 }
983 }
984
985 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem);
986 }
987
988
### Performs a quick sanity check that a paired-end SAM/BAM file has not been sorted by position.
###
### Argument: name of the Bismark result file. Files ending in .gz are read through zcat; files
### ending in bam, or reported as gzip data by 'file -b', are read through 'samtools view -h'
### using the file-level $samtools_path; everything else is read as plain text.
###
### The check dies if
###   - the header contains a line starting with '@SO', or an '@HD' line carrying the tag
###     'SO:coordinate' (which is where the SAM format records the sort order), or
###   - within roughly the first 100000 alignment lines two consecutive lines (taken to be
###     Read 1 and Read 2 of a pair) do not share the same read ID, after removing a trailing
###     /1 and /2 as appended by previous versions of Bismark.
### It also dies if the file cannot be opened or if a BAM file is given without $samtools_path.
### Returns nothing meaningful if the test was passed.
sub test_positional_sorting{

  my $filename = shift;

  print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
  sleep(1);

  ### The list form of the pipe open hands the filename to the external program as one single
  ### argument without involving a shell, so that paths containing spaces or shell
  ### metacharacters are read correctly
  my $test_fh;

  if ($filename =~ /\.gz$/) {
    open ($test_fh,'-|','zcat',$filename) or die "Can't open gzipped file $filename: $!\n";
  }
  elsif ($filename =~ /bam$/ || _file_reports_gzip($filename)) {
    if ($samtools_path){
      open ($test_fh,'-|',$samtools_path,'view','-h',$filename) or die "Can't open BAM file $filename: $!\n";
    }
    else{
      die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
    }
  }
  else {
    open ($test_fh,'<',$filename) or die "Can't open file $filename: $!\n";
  }

  my $count = 0;

  while (my $line = <$test_fh>) {

    if ($line =~ /^\@/) { # testing header lines for an indication that the file was sorted by position
      if ($line =~ /^\@SO/ or ($line =~ /^\@HD/ and $line =~ /\tSO:coordinate\b/)) {
	die "SAM/BAM header line '$line' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
      }
      next;
    }
    $count++;

    last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

    my ($id_1) = (split (/\t/,$line));

    ### reading the next line which should be read 2
    my $line_2 = <$test_fh>;
    last unless (defined $line_2); # odd number of alignment lines: there is no read 2 left to compare
    my ($id_2) = (split (/\t/,$line_2));
    last unless ($id_2);
    ++$count;

    ### ids are the same
    next if ($id_1 eq $id_2);

    ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
    my $id_1_trunc = $id_1;
    $id_1_trunc =~ s/\/1$//;
    my $id_2_trunc = $id_2;
    $id_2_trunc =~ s/\/2$//;

    unless ($id_1_trunc eq $id_2_trunc){
      die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
    }
  }

  ### The return value of close is ignored on purpose: we usually stop reading long before the end
  ### of the file, so a piped zcat/samtools process may finish with a non-zero status, which
  ### close() would report as a failure
  close $test_fh;

  ### If it hasn't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
  warn "...passed!\n";
  sleep(1);

}

### Returns 1 if the output of 'file -b' for the given file starts with 'gzip' (which is used
### to spot BAM files that do not end in bam), and 0 otherwise - including the case that the
### 'file' program could not be run. The filename is passed without involving a shell.
sub _file_reports_gzip{

  my ($file) = @_;

  open (my $type_fh,'-|','file','-b',$file) or return 0;
  my $description = <$type_fh>;
  close $type_fh;

  return (defined $description and $description =~ /^gzip/) ? 1 : 0;
}
541 1053
542 1054
543 sub process_Bismark_results_file{ 1055 sub process_Bismark_results_file{
544 my $filename = shift; 1056 my $filename = shift;
546 warn "\nNow reading in Bismark result file $filename\n\n"; 1058 warn "\nNow reading in Bismark result file $filename\n\n";
547 1059
548 if ($filename =~ /\.gz$/) { 1060 if ($filename =~ /\.gz$/) {
549 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; 1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
550 } 1062 }
551 elsif ($filename =~ /bam$/) { 1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
552 if ($samtools_path){ 1064 if ($samtools_path){
553 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; 1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
554 } 1066 }
555 else{ 1067 else{
556 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; 1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
607 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; 1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
608 } else { 1120 } else {
609 print REPORT "Bismark result file: single-end (SAM format)\n"; # default 1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
610 } 1122 }
611 } 1123 }
612 1124 if ($single){
613 if ($ignore) { 1125 if ($ignore) {
614 print REPORT "Ignoring first $ignore bases\n"; 1126 print REPORT "Ignoring first $ignore bp\n";
1127 }
1128 }
1129 else{ # paired-end
1130 if ($ignore) {
1131 print REPORT "Ignoring first $ignore bp of Read 1\n";
1132 }
1133 if ($ignore_r2){
1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
1135 }
615 } 1136 }
616 1137
617 if ($full) { 1138 if ($full) {
618 print REPORT "Output specified: comprehensive\n"; 1139 print REPORT "Output specified: comprehensive\n";
619 } else { 1140 } else {
646 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); 1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
647 $cpg_output = $output_dir . $cpg_output; 1168 $cpg_output = $output_dir . $cpg_output;
648 1169
649 if ($gzip){ 1170 if ($gzip){
650 $cpg_output .= '.gz'; 1171 $cpg_output .= '.gz';
651 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; 1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
652 } 1173 }
653 else{ 1174 else{
654 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; 1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
655 } 1176 }
656 1177
657 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; 1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
658 push @sorting_files,$cpg_output; 1179 push @sorting_files,$cpg_output;
659 1180
660 unless ($no_header) { 1181 unless ($no_header) {
661 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; 1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
662 } 1183 }
663 1184
664 ### C in any other context than CpG 1185 ### C in any other context than CpG
665 $other_c_output =~ s/^/Non_CpG_context_/; 1186 $other_c_output =~ s/^/Non_CpG_context_/;
666 $other_c_output =~ s/sam$/txt/; 1187 $other_c_output =~ s/sam$/txt/;
668 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); 1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
669 $other_c_output = $output_dir . $other_c_output; 1190 $other_c_output = $output_dir . $other_c_output;
670 1191
671 if ($gzip){ 1192 if ($gzip){
672 $other_c_output .= '.gz'; 1193 $other_c_output .= '.gz';
673 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n"; 1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
674 } 1195 }
675 else{ 1196 else{
676 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n"; 1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
677 } 1198 }
678 1199
679 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n"; 1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
680 push @sorting_files,$other_c_output; 1201 push @sorting_files,$other_c_output;
681 1202
682 1203
683 unless ($no_header) { 1204 unless ($no_header) {
684 print {$fhs{other_context}} "Bismark methylation extractor version $version\n"; 1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
685 } 1206 }
686 } 1207 }
687 1208
688 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found 1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
689 elsif ($merge_non_CpG) { 1210 elsif ($merge_non_CpG) {
697 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); 1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
698 $cpg_ot = $output_dir . $cpg_ot; 1219 $cpg_ot = $output_dir . $cpg_ot;
699 1220
700 if ($gzip){ 1221 if ($gzip){
701 $cpg_ot .= '.gz'; 1222 $cpg_ot .= '.gz';
702 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; 1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
703 } 1224 }
704 else{ 1225 else{
705 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; 1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
706 } 1227 }
707 1228
708 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; 1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
709 push @sorting_files,$cpg_ot; 1230 push @sorting_files,$cpg_ot;
710 1231
711 unless($no_header){ 1232 unless($no_header){
712 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; 1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
713 } 1234 }
714 1235
715 $cpg_ctot =~ s/^/CpG_CTOT_/; 1236 $cpg_ctot =~ s/^/CpG_CTOT_/;
716 $cpg_ctot =~ s/sam$/txt/; 1237 $cpg_ctot =~ s/sam$/txt/;
717 $cpg_ctot =~ s/bam$/txt/; 1238 $cpg_ctot =~ s/bam$/txt/;
718 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); 1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
719 $cpg_ctot = $output_dir . $cpg_ctot; 1240 $cpg_ctot = $output_dir . $cpg_ctot;
720 1241
721 if ($gzip){ 1242 if ($gzip){
722 $cpg_ctot .= '.gz'; 1243 $cpg_ctot .= '.gz';
723 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; 1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
724 } 1245 }
725 else{ 1246 else{
726 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; 1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
727 } 1248 }
728 1249
729 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; 1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
730 push @sorting_files,$cpg_ctot; 1251 push @sorting_files,$cpg_ctot;
731 1252
732 unless($no_header){ 1253 unless($no_header){
733 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; 1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
734 } 1255 }
735 1256
736 $cpg_ctob =~ s/^/CpG_CTOB_/; 1257 $cpg_ctob =~ s/^/CpG_CTOB_/;
737 $cpg_ctob =~ s/sam$/txt/; 1258 $cpg_ctob =~ s/sam$/txt/;
738 $cpg_ctob =~ s/bam$/txt/; 1259 $cpg_ctob =~ s/bam$/txt/;
739 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); 1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
740 $cpg_ctob = $output_dir . $cpg_ctob; 1261 $cpg_ctob = $output_dir . $cpg_ctob;
741 1262
742 if ($gzip){ 1263 if ($gzip){
743 $cpg_ctob .= '.gz'; 1264 $cpg_ctob .= '.gz';
744 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; 1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
745 } 1266 }
746 else{ 1267 else{
747 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; 1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
748 } 1269 }
749 1270
750 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; 1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
751 push @sorting_files,$cpg_ctob; 1272 push @sorting_files,$cpg_ctob;
752 1273
753 unless($no_header){ 1274 unless($no_header){
754 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; 1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
755 } 1276 }
756 1277
757 $cpg_ob =~ s/^/CpG_OB_/; 1278 $cpg_ob =~ s/^/CpG_OB_/;
758 $cpg_ob =~ s/sam$/txt/; 1279 $cpg_ob =~ s/sam$/txt/;
759 $cpg_ob =~ s/bam$/txt/; 1280 $cpg_ob =~ s/bam$/txt/;
760 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); 1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
761 $cpg_ob = $output_dir . $cpg_ob; 1282 $cpg_ob = $output_dir . $cpg_ob;
762 1283
763 if ($gzip){ 1284 if ($gzip){
764 $cpg_ob .= '.gz'; 1285 $cpg_ob .= '.gz';
765 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; 1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
766 } 1287 }
767 else{ 1288 else{
768 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; 1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
769 } 1290 }
770 1291
771 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; 1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
772 push @sorting_files,$cpg_ob; 1293 push @sorting_files,$cpg_ob;
773 1294
774 unless($no_header){ 1295 unless($no_header){
775 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; 1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
776 } 1297 }
777 1298
778 ### For cytosines in Non-CpG (CC, CT or CA) context 1299 ### For cytosines in Non-CpG (CC, CT or CA) context
779 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; 1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
780 1301
784 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); 1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
785 $other_c_ot = $output_dir . $other_c_ot; 1306 $other_c_ot = $output_dir . $other_c_ot;
786 1307
787 if ($gzip){ 1308 if ($gzip){
788 $other_c_ot .= '.gz'; 1309 $other_c_ot .= '.gz';
789 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n"; 1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
790 } 1311 }
791 else{ 1312 else{
792 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n"; 1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
793 } 1314 }
794 1315
795 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n"; 1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
796 push @sorting_files,$other_c_ot; 1317 push @sorting_files,$other_c_ot;
797 1318
798 unless($no_header){ 1319 unless($no_header){
799 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n"; 1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
800 } 1321 }
801 1322
802 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; 1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
803 $other_c_ctot =~ s/sam$/txt/; 1324 $other_c_ctot =~ s/sam$/txt/;
804 $other_c_ctot =~ s/bam$/txt/; 1325 $other_c_ctot =~ s/bam$/txt/;
805 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); 1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
806 $other_c_ctot = $output_dir . $other_c_ctot; 1327 $other_c_ctot = $output_dir . $other_c_ctot;
807 1328
808 if ($gzip){ 1329 if ($gzip){
809 $other_c_ctot .= '.gz'; 1330 $other_c_ctot .= '.gz';
810 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n"; 1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
811 } 1332 }
812 else{ 1333 else{
813 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n"; 1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
814 } 1335 }
815 1336
816 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n"; 1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
817 push @sorting_files,$other_c_ctot; 1338 push @sorting_files,$other_c_ctot;
818 1339
819 unless($no_header){ 1340 unless($no_header){
820 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n"; 1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
821 } 1342 }
822 1343
823 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; 1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
824 $other_c_ctob =~ s/sam$/txt/; 1345 $other_c_ctob =~ s/sam$/txt/;
825 $other_c_ctob =~ s/bam$/txt/; 1346 $other_c_ctob =~ s/bam$/txt/;
826 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); 1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
827 $other_c_ctob = $output_dir . $other_c_ctob; 1348 $other_c_ctob = $output_dir . $other_c_ctob;
828 1349
829 if ($gzip){ 1350 if ($gzip){
830 $other_c_ctob .= '.gz'; 1351 $other_c_ctob .= '.gz';
831 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n"; 1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
832 } 1353 }
833 else{ 1354 else{
834 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n"; 1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
835 } 1356 }
836 1357
837 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n"; 1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
838 push @sorting_files,$other_c_ctob; 1359 push @sorting_files,$other_c_ctob;
839 1360
840 unless($no_header){ 1361 unless($no_header){
841 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n"; 1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
842 } 1363 }
843 1364
844 $other_c_ob =~ s/^/Non_CpG_OB_/; 1365 $other_c_ob =~ s/^/Non_CpG_OB_/;
845 $other_c_ob =~ s/sam$/txt/; 1366 $other_c_ob =~ s/sam$/txt/;
846 $other_c_ob =~ s/sam$/txt/; 1367 $other_c_ob =~ s/sam$/txt/;
847 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); 1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
848 $other_c_ob = $output_dir . $other_c_ob; 1369 $other_c_ob = $output_dir . $other_c_ob;
849 1370
850 if ($gzip){ 1371 if ($gzip){
851 $other_c_ob .= '.gz'; 1372 $other_c_ob .= '.gz';
852 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n"; 1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
853 } 1374 }
854 else{ 1375 else{
855 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n"; 1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
856 } 1377 }
857 1378
858 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n"; 1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
859 push @sorting_files,$other_c_ob; 1380 push @sorting_files,$other_c_ob;
860 1381
861 unless($no_header){ 1382 unless($no_header){
862 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n"; 1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
863 } 1384 }
864 } 1385 }
865 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) 1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
866 1387
867 ### if --comprehensive was specified we are only writing one file per context 1388 ### if --comprehensive was specified we are only writing one file per context
874 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); 1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
875 $cpg_output = $output_dir . $cpg_output; 1396 $cpg_output = $output_dir . $cpg_output;
876 1397
877 if ($gzip){ 1398 if ($gzip){
878 $cpg_output .= '.gz'; 1399 $cpg_output .= '.gz';
879 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n"; 1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
880 } 1401 }
881 else{ 1402 else{
882 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n"; 1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
883 } 1404 }
884 1405
885 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n"; 1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
886 push @sorting_files,$cpg_output; 1407 push @sorting_files,$cpg_output;
887 1408
888 unless($no_header){ 1409 unless($no_header){
889 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n"; 1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
890 } 1411 }
891 1412
892 ### C in CHG context 1413 ### C in CHG context
893 $chg_output =~ s/^/CHG_context_/; 1414 $chg_output =~ s/^/CHG_context_/;
894 $chg_output =~ s/sam$/txt/; 1415 $chg_output =~ s/sam$/txt/;
896 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); 1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
897 $chg_output = $output_dir . $chg_output; 1418 $chg_output = $output_dir . $chg_output;
898 1419
899 if ($gzip){ 1420 if ($gzip){
900 $chg_output .= '.gz'; 1421 $chg_output .= '.gz';
901 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n"; 1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
902 } 1423 }
903 else{ 1424 else{
904 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n"; 1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
905 } 1426 }
906 1427
907 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n"; 1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
908 push @sorting_files,$chg_output; 1429 push @sorting_files,$chg_output;
909 1430
910 unless($no_header){ 1431 unless($no_header){
911 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n"; 1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
912 } 1433 }
913 1434
914 ### C in CHH context 1435 ### C in CHH context
915 $chh_output =~ s/^/CHH_context_/; 1436 $chh_output =~ s/^/CHH_context_/;
916 $chh_output =~ s/sam$/txt/; 1437 $chh_output =~ s/sam$/txt/;
918 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); 1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
919 $chh_output = $output_dir . $chh_output; 1440 $chh_output = $output_dir . $chh_output;
920 1441
921 if ($gzip){ 1442 if ($gzip){
922 $chh_output .= '.gz'; 1443 $chh_output .= '.gz';
923 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n"; 1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
924 } 1445 }
925 else{ 1446 else{
926 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n"; 1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
927 } 1448 }
928 1449
929 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n"; 1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
930 push @sorting_files, $chh_output; 1451 push @sorting_files, $chh_output;
931 1452
932 unless($no_header){ 1453 unless($no_header){
933 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n"; 1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
934 } 1455 }
935 } 1456 }
936 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found 1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
937 else { 1458 else {
938 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; 1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
944 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); 1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
945 $cpg_ot = $output_dir . $cpg_ot; 1466 $cpg_ot = $output_dir . $cpg_ot;
946 1467
947 if ($gzip){ 1468 if ($gzip){
948 $cpg_ot .= '.gz'; 1469 $cpg_ot .= '.gz';
949 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n"; 1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
950 } 1471 }
951 else{ 1472 else{
952 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n"; 1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
953 } 1474 }
954 1475
955 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n"; 1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
956 push @sorting_files,$cpg_ot; 1477 push @sorting_files,$cpg_ot;
957 1478
958 unless($no_header){ 1479 unless($no_header){
959 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n"; 1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
960 } 1481 }
961 1482
962 $cpg_ctot =~ s/^/CpG_CTOT_/; 1483 $cpg_ctot =~ s/^/CpG_CTOT_/;
963 $cpg_ctot =~ s/sam$/txt/; 1484 $cpg_ctot =~ s/sam$/txt/;
964 $cpg_ctot =~ s/bam$/txt/; 1485 $cpg_ctot =~ s/bam$/txt/;
965 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); 1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
966 $cpg_ctot = $output_dir . $cpg_ctot; 1487 $cpg_ctot = $output_dir . $cpg_ctot;
967 1488
968 if ($gzip){ 1489 if ($gzip){
969 $cpg_ctot .= '.gz'; 1490 $cpg_ctot .= '.gz';
970 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n"; 1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
971 } 1492 }
972 else{ 1493 else{
973 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n"; 1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
974 } 1495 }
975 1496
976 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n"; 1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
977 push @sorting_files,$cpg_ctot; 1498 push @sorting_files,$cpg_ctot;
978 1499
979 unless($no_header){ 1500 unless($no_header){
980 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n"; 1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
981 } 1502 }
982 1503
983 $cpg_ctob =~ s/^/CpG_CTOB_/; 1504 $cpg_ctob =~ s/^/CpG_CTOB_/;
984 $cpg_ctob =~ s/sam$/txt/; 1505 $cpg_ctob =~ s/sam$/txt/;
985 $cpg_ctob =~ s/bam$/txt/; 1506 $cpg_ctob =~ s/bam$/txt/;
986 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); 1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
987 $cpg_ctob = $output_dir . $cpg_ctob; 1508 $cpg_ctob = $output_dir . $cpg_ctob;
988 1509
989 if ($gzip){ 1510 if ($gzip){
990 $cpg_ctob .= '.gz'; 1511 $cpg_ctob .= '.gz';
991 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n"; 1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
992 } 1513 }
993 else{ 1514 else{
994 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n"; 1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
995 } 1516 }
996 1517
997 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n"; 1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
998 push @sorting_files,$cpg_ctob; 1519 push @sorting_files,$cpg_ctob;
999 1520
1000 unless($no_header){ 1521 unless($no_header){
1001 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n"; 1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1002 } 1523 }
1003 1524
1004 $cpg_ob =~ s/^/CpG_OB_/; 1525 $cpg_ob =~ s/^/CpG_OB_/;
1005 $cpg_ob =~ s/sam$/txt/; 1526 $cpg_ob =~ s/sam$/txt/;
1006 $cpg_ob =~ s/bam$/txt/; 1527 $cpg_ob =~ s/bam$/txt/;
1007 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); 1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
1008 $cpg_ob = $output_dir . $cpg_ob; 1529 $cpg_ob = $output_dir . $cpg_ob;
1009 1530
1010 if ($gzip){ 1531 if ($gzip){
1011 $cpg_ob .= '.gz'; 1532 $cpg_ob .= '.gz';
1012 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n"; 1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1013 } 1534 }
1014 else{ 1535 else{
1015 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n"; 1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1016 } 1537 }
1017 1538
1018 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n"; 1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
1019 push @sorting_files,$cpg_ob; 1540 push @sorting_files,$cpg_ob;
1020 1541
1021 unless($no_header){ 1542 unless($no_header){
1022 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n"; 1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1023 } 1544 }
1024 1545
1025 ### For cytosines in CHG context 1546 ### For cytosines in CHG context
1026 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; 1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
1027 1548
1031 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); 1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
1032 $chg_ot = $output_dir . $chg_ot; 1553 $chg_ot = $output_dir . $chg_ot;
1033 1554
1034 if ($gzip){ 1555 if ($gzip){
1035 $chg_ot .= '.gz'; 1556 $chg_ot .= '.gz';
1036 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n"; 1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
1037 } 1558 }
1038 else{ 1559 else{
1039 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n"; 1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
1040 } 1561 }
1041 1562
1042 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n"; 1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
1043 push @sorting_files,$chg_ot; 1564 push @sorting_files,$chg_ot;
1044 1565
1045 unless($no_header){ 1566 unless($no_header){
1046 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n"; 1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1047 } 1568 }
1048 1569
1049 $chg_ctot =~ s/^/CHG_CTOT_/; 1570 $chg_ctot =~ s/^/CHG_CTOT_/;
1050 $chg_ctot =~ s/sam$/txt/; 1571 $chg_ctot =~ s/sam$/txt/;
1051 $chg_ctot =~ s/bam$/txt/; 1572 $chg_ctot =~ s/bam$/txt/;
1052 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); 1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
1053 $chg_ctot = $output_dir . $chg_ctot; 1574 $chg_ctot = $output_dir . $chg_ctot;
1054 1575
1055 if ($gzip){ 1576 if ($gzip){
1056 $chg_ctot .= '.gz'; 1577 $chg_ctot .= '.gz';
1057 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n"; 1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
1058 } 1579 }
1059 else{ 1580 else{
1060 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n"; 1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
1061 } 1582 }
1062 1583
1063 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n"; 1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
1064 push @sorting_files,$chg_ctot; 1585 push @sorting_files,$chg_ctot;
1065 1586
1066 unless($no_header){ 1587 unless($no_header){
1067 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n"; 1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1068 } 1589 }
1069 1590
1070 $chg_ctob =~ s/^/CHG_CTOB_/; 1591 $chg_ctob =~ s/^/CHG_CTOB_/;
1071 $chg_ctob =~ s/sam$/txt/; 1592 $chg_ctob =~ s/sam$/txt/;
1072 $chg_ctob =~ s/bam$/txt/; 1593 $chg_ctob =~ s/bam$/txt/;
1073 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); 1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
1074 $chg_ctob = $output_dir . $chg_ctob; 1595 $chg_ctob = $output_dir . $chg_ctob;
1075 1596
1076 if ($gzip){ 1597 if ($gzip){
1077 $chg_ctob .= '.gz'; 1598 $chg_ctob .= '.gz';
1078 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n"; 1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
1079 } 1600 }
1080 else{ 1601 else{
1081 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n"; 1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
1082 } 1603 }
1083 1604
1084 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n"; 1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
1085 push @sorting_files,$chg_ctob; 1606 push @sorting_files,$chg_ctob;
1086 1607
1087 unless($no_header){ 1608 unless($no_header){
1088 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n"; 1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1089 } 1610 }
1090 1611
1091 $chg_ob =~ s/^/CHG_OB_/; 1612 $chg_ob =~ s/^/CHG_OB_/;
1092 $chg_ob =~ s/sam$/txt/; 1613 $chg_ob =~ s/sam$/txt/;
1093 $chg_ob =~ s/bam$/txt/; 1614 $chg_ob =~ s/bam$/txt/;
1094 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); 1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
1095 $chg_ob = $output_dir . $chg_ob; 1616 $chg_ob = $output_dir . $chg_ob;
1096 1617
1097 if ($gzip){ 1618 if ($gzip){
1098 $chg_ob .= '.gz'; 1619 $chg_ob .= '.gz';
1099 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n"; 1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
1100 } 1621 }
1101 else{ 1622 else{
1102 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n"; 1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
1103 } 1624 }
1104 1625
1105 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n"; 1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
1106 push @sorting_files,$chg_ob; 1627 push @sorting_files,$chg_ob;
1107 1628
1108 unless($no_header){ 1629 unless($no_header){
1109 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n"; 1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1110 } 1631 }
1111 1632
1112 ### For cytosines in CHH context 1633 ### For cytosines in CHH context
1113 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; 1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
1114 1635
1118 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); 1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
1119 $chh_ot = $output_dir . $chh_ot; 1640 $chh_ot = $output_dir . $chh_ot;
1120 1641
1121 if ($gzip){ 1642 if ($gzip){
1122 $chh_ot .= '.gz'; 1643 $chh_ot .= '.gz';
1123 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n"; 1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
1124 } 1645 }
1125 else{ 1646 else{
1126 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n"; 1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
1127 } 1648 }
1128 1649
1129 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n"; 1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
1130 push @sorting_files,$chh_ot; 1651 push @sorting_files,$chh_ot;
1131 1652
1132 unless($no_header){ 1653 unless($no_header){
1133 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n"; 1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1134 } 1655 }
1135 1656
1136 $chh_ctot =~ s/^/CHH_CTOT_/; 1657 $chh_ctot =~ s/^/CHH_CTOT_/;
1137 $chh_ctot =~ s/sam$/txt/; 1658 $chh_ctot =~ s/sam$/txt/;
1138 $chh_ctot =~ s/bam$/txt/; 1659 $chh_ctot =~ s/bam$/txt/;
1139 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); 1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
1140 $chh_ctot = $output_dir . $chh_ctot; 1661 $chh_ctot = $output_dir . $chh_ctot;
1141 1662
1142 if ($gzip){ 1663 if ($gzip){
1143 $chh_ctot .= '.gz'; 1664 $chh_ctot .= '.gz';
1144 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n"; 1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
1145 } 1666 }
1146 else{ 1667 else{
1147 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n"; 1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
1148 } 1669 }
1149 1670
1150 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n"; 1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
1151 push @sorting_files,$chh_ctot; 1672 push @sorting_files,$chh_ctot;
1152 1673
1153 unless($no_header){ 1674 unless($no_header){
1154 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n"; 1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1155 } 1676 }
1156 1677
1157 $chh_ctob =~ s/^/CHH_CTOB_/; 1678 $chh_ctob =~ s/^/CHH_CTOB_/;
1158 $chh_ctob =~ s/sam$/txt/; 1679 $chh_ctob =~ s/sam$/txt/;
1159 $chh_ctob =~ s/bam$/txt/; 1680 $chh_ctob =~ s/bam$/txt/;
1160 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); 1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
1161 $chh_ctob = $output_dir . $chh_ctob; 1682 $chh_ctob = $output_dir . $chh_ctob;
1162 1683
1163 if ($gzip){ 1684 if ($gzip){
1164 $chh_ctob .= '.gz'; 1685 $chh_ctob .= '.gz';
1165 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n"; 1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
1166 } 1687 }
1167 else{ 1688 else{
1168 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n"; 1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
1169 } 1690 }
1170 1691
1171 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n"; 1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
1172 push @sorting_files,$chh_ctob; 1693 push @sorting_files,$chh_ctob;
1173 1694
1174 unless($no_header){ 1695 unless($no_header){
1175 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n"; 1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1176 } 1697 }
1177 1698
1178 $chh_ob =~ s/^/CHH_OB_/; 1699 $chh_ob =~ s/^/CHH_OB_/;
1179 $chh_ob =~ s/sam$/txt/; 1700 $chh_ob =~ s/sam$/txt/;
1180 $chh_ob =~ s/bam$/txt/; 1701 $chh_ob =~ s/bam$/txt/;
1181 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); 1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
1182 $chh_ob = $output_dir . $chh_ob; 1703 $chh_ob = $output_dir . $chh_ob;
1183 1704
1184 if ($gzip){ 1705 if ($gzip){
1185 $chh_ob .= '.gz'; 1706 $chh_ob .= '.gz';
1186 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n"; 1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
1187 } 1708 }
1188 else{ 1709 else{
1189 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n"; 1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
1190 } 1711 }
1191 1712
1192 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n"; 1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
1193 push @sorting_files,$chh_ob; 1714 push @sorting_files,$chh_ob;
1194 1715
1195 unless($no_header){ 1716 unless($no_header){
1196 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n"; 1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1197 } 1718 }
1198 } 1719 }
1199 1720
1200 my $methylation_call_strings_processed = 0; 1721 my $methylation_call_strings_processed = 0;
1201 my $line_count = 0; 1722 my $line_count = 0;
1325 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> 1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1326 if ($ignore) { 1847 if ($ignore) {
1327 # print "\n\n$meth_call\n"; 1848 # print "\n\n$meth_call\n";
1328 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); 1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1329 # print "$meth_call\n"; 1850 # print "$meth_call\n";
1851
1330 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly 1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1331 1853
1332 my @len = split (/\D+/,$cigar); # storing the length per operation 1854 my @len = split (/\D+/,$cigar); # storing the length per operation
1333 my @ops = split (/\d+/,$cigar); # storing the operation 1855 my @ops = split (/\d+/,$cigar); # storing the operation
1334 shift @ops; # remove the empty first element 1856 shift @ops; # remove the empty first element
1442 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; 1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1443 } 1965 }
1444 1966
1445 if ($meth_call_1 and $meth_call_2) { 1967 if ($meth_call_1 and $meth_call_2) {
1446 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' 1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1969
1447 if ($ignore) { 1970 if ($ignore) {
1448 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); 1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1449 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore); 1972
1450
1451 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified 1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1452 $start_read_1 += $ignore; 1974 $start_read_1 += $ignore;
1453 $end_read_2 -= $ignore; 1975 }
1454 } 1976 if ($ignore_r2) {
1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
1978
1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
1980 $end_read_2 -= $ignore_r2;
1981 }
1982
1455 my $end_read_1; 1983 my $end_read_1;
1456 my $start_read_2; 1984 my $start_read_2;
1457 1985
1458 if ($strand eq '+') { 1986 if ($strand eq '+') {
1459 1987
1460 $end_read_1 = $start_read_1+length($meth_call_1)-1; 1988 $end_read_1 = $start_read_1+length($meth_call_1)-1;
1461 $start_read_2 = $end_read_2-length($meth_call_2)+1; 1989 $start_read_2 = $end_read_2-length($meth_call_2)+1;
1462 1990
1463 ## we first pass the first read which is in + orientation on the forward strand 1991 ## we first pass the first read which is in + orientation on the forward strand
1464 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0); 1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
1465 1993
1466 # we next pass the second read which is in - orientation on the reverse strand 1994 # we next pass the second read which is in - orientation on the reverse strand
1467 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1468 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1); 1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
1469 } else { 1997 }
1998 else {
1470 1999
1471 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! 2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
1472 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! 2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
1473 2002
1474 ## we first pass the first read which is in - orientation on the reverse strand 2003 ## we first pass the first read which is in - orientation on the reverse strand
1475 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0); 2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
1476 2005
1477 # we next pass the second read which is in + orientation on the forward strand 2006 # we next pass the second read which is in + orientation on the forward strand
1478 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1479 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2); 2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
1480 } 2009 }
1481 2010
1482 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls 2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1483 } 2012 }
1484 } 2013 }
1485 } else { # Bismark paired-end SAM output format (default) 2014 }
2015 else { # Bismark paired-end SAM output format (default)
1486 while (<IN>) { 2016 while (<IN>) {
1487 ### SAM format can either start with header lines (starting with @) or start with alignments directly 2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1488 if (/^\@/) { # skipping header lines (starting with @) 2018 if (/^\@/) { # skipping header lines (starting with @)
1489 warn "skipping SAM header line:\t$_"; 2019 warn "skipping SAM header line:\t$_";
1490 next; 2020 next;
1583 2113
1584 ### READ 1 2114 ### READ 1
1585 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation 2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
1586 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation 2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
1587 shift @ops_1; # remove the empty first element 2117 shift @ops_1; # remove the empty first element
1588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); 2118
2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
1589 2120
1590 my @comp_cigar_1; # building an array with all CIGAR operations 2121 my @comp_cigar_1; # building an array with all CIGAR operations
1591 foreach my $index (0..$#len_1) { 2122 foreach my $index (0..$#len_1) {
1592 foreach (1..$len_1[$index]) { 2123 foreach (1..$len_1[$index]) {
1593 # print "$ops_1[$index]"; 2124 # print "$ops_1[$index]";
1610 } 2141 }
1611 } 2142 }
1612 # print "original CIGAR read 2: $cigar_2\n"; 2143 # print "original CIGAR read 2: $cigar_2\n";
1613 # print "original CIGAR read 2: @comp_cigar_2\n"; 2144 # print "original CIGAR read 2: @comp_cigar_2\n";
1614 2145
2146
2147
1615 if ($ignore) { 2148 if ($ignore) {
1616 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' 2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1
1617 ### the methylation calls have already been reversed where necessary 2150 ### the methylation calls have already been reversed where necessary
1618 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); 2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1619 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
1620
1621 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1622 2152
1623 if ($strand eq '+') { 2153 if ($strand eq '+') {
1624 2154
1625 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start 2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
1626 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions 2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
1627 my $I_count_1 = 0; 2157 my $I_count_1 = 0;
1628 2158
1629 for (1..$ignore) { 2159 for (1..$ignore) {
1630 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start 2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
1631 # print "$_ deleted $op\n"; 2161 # print "$_ deleted $op\n";
1632 2162
1633 while ($op eq 'D') { # repeating this for deletions (D) 2163 while ($op eq 'D') { # repeating this for deletions (D)
1640 } 2170 }
1641 } 2171 }
1642 2172
1643 $start_read_1 += $ignore + $D_count_1 - $I_count_1; 2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
1644 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; 2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
1645 2175
1646 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
1647
1648 for (1..$ignore) {
1649 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1650 while ($op eq 'D') { # repeating this for deletions (D)
1651 $op = pop @comp_cigar_2;
1652 }
1653 }
1654 # the start position of reads mapping to the reverse strand is being adjusted further below 2176 # the start position of reads mapping to the reverse strand is being adjusted further below
1655 } elsif ($strand eq '-') { 2177 }
2178 elsif ($strand eq '-') {
1656 2179
1657 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back 2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
1658 for (1..$ignore) { 2181 for (1..$ignore) {
1659 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array 2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1660 while ($op eq 'D') { # repeating this for deletions (D) 2183 while ($op eq 'D') { # repeating this for deletions (D)
1661 $op = pop @comp_cigar_1; 2184 $op = pop @comp_cigar_1;
1662 } 2185 }
1663 } 2186 }
1664 # the start position of reads mapping to the reverse strand is being adjusted further below 2187 # the start position of reads mapping to the reverse strand is being adjusted further below
1665 2188
2189 }
2190 }
2191
2192 if ($ignore_r2) {
2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
2194 ### the methylation calls have already been reversed where necessary
2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
2196
2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
2198
2199 if ($strand eq '+') {
2200
2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
2202
2203 for (1..$ignore_r2) {
2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2205 while ($op eq 'D') { # repeating this for deletions (D)
2206 $op = pop @comp_cigar_2;
2207 }
2208 }
2209 # the start position of reads mapping to the reverse strand is being adjusted further below
2210 }
2211 elsif ($strand eq '-') {
2212
1666 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start 2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
1667 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions 2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
1668 my $I_count_2 = 0; 2215 my $I_count_2 = 0;
1669 2216
1670 for (1..$ignore) { 2217 for (1..$ignore_r2) {
1671 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start 2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
1672 # print "$_ deleted $op\n"; 2219 # print "$_ deleted $op\n";
1673 2220
1674 while ($op eq 'D') { # repeating this for deletions (D) 2221 while ($op eq 'D') { # repeating this for deletions (D)
1675 $D_count_2++; 2222 $D_count_2++;
1679 if ($op eq 'I') { # adjusting the genomic position for insertions (I) 2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1680 $I_count_2++; 2227 $I_count_2++;
1681 } 2228 }
1682 } 2229 }
1683 2230
1684 $start_read_2 += $ignore + $D_count_2 - $I_count_2; 2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
1685 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; 2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
1686 2233 }
1687 } 2234 }
1688 2235
2236 if ($ignore){
1689 ### reconstituting shortened CIGAR string 1 2237 ### reconstituting shortened CIGAR string 1
1690 my $new_cigar_1; 2238 my $new_cigar_1;
1691 my $count_1 = 0; 2239 my $count_1 = 0;
1692 my $last_op_1; 2240 my $last_op_1;
1693 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; 2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
1706 } 2254 }
1707 } 2255 }
1708 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count 2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
1709 $cigar_1 = $new_cigar_1; 2257 $cigar_1 = $new_cigar_1;
1710 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; 2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
2259 }
2260
2261 if ($ignore_r2){
1711 2262
1712 ### reconstituting shortened CIGAR string 2 2263 ### reconstituting shortened CIGAR string 2
1713 my $new_cigar_2; 2264 my $new_cigar_2;
1714 my $count_2 = 0; 2265 my $count_2 = 0;
1715 my $last_op_2; 2266 my $last_op_2;
1720 ++$count_2; 2271 ++$count_2;
1721 next; 2272 next;
1722 } 2273 }
1723 if ($last_op_2 eq $op) { 2274 if ($last_op_2 eq $op) {
1724 ++$count_2; 2275 ++$count_2;
1725 } else { 2276 }
2277 else {
1726 $new_cigar_2 .= "$count_2$last_op_2"; 2278 $new_cigar_2 .= "$count_2$last_op_2";
1727 $last_op_2 = $op; 2279 $last_op_2 = $op;
1728 $count_2 = 1; 2280 $count_2 = 1;
1729 } 2281 }
1730 } 2282 }
1731 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count 2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
1732 $cigar_2 = $new_cigar_2; 2284 $cigar_2 = $new_cigar_2;
1733 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n"; 2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
1734 2286 }
1735 } 2287
2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
1736 2289
1737 if ($strand eq '+') { 2290 if ($strand eq '+') {
1738 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
1739 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too 2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1740 # print "reverse: @comp_cigar_2\n"; 2293 # print "reverse: @comp_cigar_2\n";
1749 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't 2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1750 } 2303 }
1751 2304
1752 $end_read_1 = $start_read_1 + $MD_count_1 - 1; 2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
1753 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand 2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
1754 } else { 2307 }
2308 else {
1755 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
1756 2310
1757 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too 2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1758 # print "reverse: @comp_cigar_1\n"; 2312 # print "reverse: @comp_cigar_1\n";
1759 2313
1762 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't 2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1763 } 2317 }
1764 2318
1765 $end_read_1 = $start_read_1; 2319 $end_read_1 = $start_read_1;
1766 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand 2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
1767
1768 } 2321 }
1769 2322
1770 if ($strand eq '+') { 2323 if ($strand eq '+') {
1771 ## we first pass the first read which is in + orientation on the forward strand 2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
1772 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1); 2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
1773 2326
1774 # we next pass the second read which is in - orientation on the reverse strand 2327 # we next pass the second read which is in - orientation on the reverse strand
1775 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1776 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2); 2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
1777 } else { 2330 } else {
1778 ## we first pass the first read which is in - orientation on the reverse strand 2331 ## we first pass the first read which is in - orientation on the reverse strand
1779 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1); 2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
1780 2333
1781 # we next pass the second read which is in + orientation on the forward strand 2334 # we next pass the second read which is in + orientation on the forward strand
1782 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1783 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2); 2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
1784 } 2337 }
1785 2338
1786 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls 2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1787 } 2340 }
1788 } 2341 }
1789 } 2342 }
1790 } else { 2343 } else {
1791 die "Single-end or paired-end reads not specified properly\n"; 2344 die "Single-end or paired-end reads not specified properly\n";
1792 } 2345 }
1793 2346
1794 print "\n\nProcessed $line_count lines from $filename in total\n"; 2347 warn "\n\nProcessed $line_count lines from $filename in total\n";
1795 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; 2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1796 if ($report) { 2349 if ($report) {
2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n";
1797 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; 2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1798 } 2352 }
1799 print_splitting_report (); 2353 print_splitting_report ();
1800 } 2354 }
1801 2355
1932 } 2486 }
1933 } 2487 }
1934 2488
1935 2489
1936 2490
1937
1938
1939 sub print_individual_C_methylation_states_paired_end_files{ 2491 sub print_individual_C_methylation_states_paired_end_files{
1940 2492
1941 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_; 2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
2494
2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
2497
2498 my @methylation_calls = split(//,$meth_call);
2499
2500 #################################################################
2501 ### . for bases not involving cytosines ###
2502 ### X for methylated C in CHG context (was protected) ###
2503 ### x for not methylated C in CHG context (was converted) ###
2504 ### H for methylated C in CHH context (was protected) ###
2505 ### h for not methylated C in CHH context (was converted) ###
2506 ### Z for methylated C in CpG context (was protected) ###
2507 ### z for not methylated C in CpG context (was converted) ###
2508 ### U for methylated C in Unknown context (was protected) ###
2509 ### u for not methylated C in Unknown context (was converted) ###
2510 #################################################################
2511
2512 my $methyl_CHG_count = 0;
2513 my $methyl_CHH_count = 0;
2514 my $methyl_CpG_count = 0;
2515 my $unmethylated_CHG_count = 0;
2516 my $unmethylated_CHH_count = 0;
2517 my $unmethylated_CpG_count = 0;
2518
2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2521 my @comp_cigar;
2522
2523 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing
2524 if ($cigar =~ /^\d+M$/){
2525 # this check speeds up the extraction process by up to 60%!!!
2526 }
2527 else{ # parsing CIGAR string
2528 my @len;
2529 my @ops;
2530 @len = split (/\D+/,$cigar); # storing the length per operation
2531 @ops = split (/\d+/,$cigar); # storing the operation
2532 shift @ops; # remove the empty first element
2533
2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2535
2536 foreach my $index (0..$#len){
2537 foreach (1..$len[$index]){
2538 # print "$ops[$index]";
2539 push @comp_cigar, $ops[$index];
2540 }
2541 }
2542 # warn "\nDetected CIGAR string: $cigar\n";
2543 # warn "Length of methylation call: ",length $meth_call,"\n";
2544 # warn "number of operations: ",scalar @ops,"\n";
2545 # warn "number of length digits: ",scalar @len,"\n\n";
2546 # print @comp_cigar,"\n";
2547 # print "$meth_call\n\n";
2548 # sleep (1);
2549 }
2550
2551 if ($strand eq '-') {
2552
2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
2554 if (@comp_cigar){
2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2556 }
2557 # print "reverse CIGAR string: @comp_cigar\n";
2558
2559 ### the start position of paired-end files has already been corrected, see above
2560 }
2561
2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2563
2564 if ($merge_non_CpG) {
2565 if ($no_overlap) { # this has to be read 2...
2566
2567 ### single-file CpG and non-CpG context output
2568 if ($full) {
2569 if ($strand eq '+') {
2570 for my $index (0..$#methylation_calls) {
2571
2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2575 $cigar_offset += $cigar_mod;
2576 $pos_offset += $pos_mod;
2577 }
2578
2579 ### Returning as soon as the methylation calls start overlapping
2580 if ($start+$index+$pos_offset >= $end_read_1) {
2581 return;
2582 }
2583
2584 if ($methylation_calls[$index] eq 'X') {
2585 $counting{total_meCHG_count}++;
2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2587 if ($read_identity == 1){
2588 $mbias_1{CHG}->{$index+1}->{meth}++;
2589 }
2590 else{
2591 $mbias_2{CHG}->{$index+1}->{meth}++;
2592 }
2593 }
2594 elsif ($methylation_calls[$index] eq 'x') {
2595 $counting{total_unmethylated_CHG_count}++;
2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2597 if ($read_identity == 1){
2598 $mbias_1{CHG}->{$index+1}->{un}++;
2599 }
2600 else{
2601 $mbias_2{CHG}->{$index+1}->{un}++;
2602 }
2603 }
2604 elsif ($methylation_calls[$index] eq 'Z') {
2605 $counting{total_meCpG_count}++;
2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2607 if ($read_identity == 1){
2608 $mbias_1{CpG}->{$index+1}->{meth}++;
2609 }
2610 else{
2611 $mbias_2{CpG}->{$index+1}->{meth}++;
2612 }
2613 }
2614 elsif ($methylation_calls[$index] eq 'z') {
2615 $counting{total_unmethylated_CpG_count}++;
2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2617 if ($read_identity == 1){
2618 $mbias_1{CpG}->{$index+1}->{un}++;
2619 }
2620 else{
2621 $mbias_2{CpG}->{$index+1}->{un}++;
2622 }
2623 }
2624 elsif ($methylation_calls[$index] eq 'H') {
2625 $counting{total_meCHH_count}++;
2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2627 if ($read_identity == 1){
2628 $mbias_1{CHH}->{$index+1}->{meth}++;
2629 }
2630 else{
2631 $mbias_2{CHH}->{$index+1}->{meth}++;
2632 }
2633 }
2634 elsif ($methylation_calls[$index] eq 'h') {
2635 $counting{total_unmethylated_CHH_count}++;
2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2637 if ($read_identity == 1){
2638 $mbias_1{CHH}->{$index+1}->{un}++;
2639 }
2640 else{
2641 $mbias_2{CHH}->{$index+1}->{un}++;
2642 }
2643 }
2644 elsif ($methylation_calls[$index] eq '.'){}
2645 elsif (lc$methylation_calls[$index] eq 'u'){}
2646 else{
2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2648 }
2649 }
2650 }
2651 elsif ($strand eq '-') {
2652 for my $index (0..$#methylation_calls) {
2653
2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2657 $cigar_offset += $cigar_mod;
2658 $pos_offset += $pos_mod;
2659 }
2660
2661 ### Returning as soon as the methylation calls start overlapping
2662 if ($start-$index+$pos_offset <= $end_read_1) {
2663 return;
2664 }
2665
2666 if ($methylation_calls[$index] eq 'X') {
2667 $counting{total_meCHG_count}++;
2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2669 if ($read_identity == 1){
2670 $mbias_1{CHG}->{$index+1}->{meth}++;
2671 }
2672 else{
2673 $mbias_2{CHG}->{$index+1}->{meth}++;
2674 }
2675 }
2676 elsif ($methylation_calls[$index] eq 'x') {
2677 $counting{total_unmethylated_CHG_count}++;
2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2679 if ($read_identity == 1){
2680 $mbias_1{CHG}->{$index+1}->{un}++;
2681 }
2682 else{
2683 $mbias_2{CHG}->{$index+1}->{un}++;
2684 }
2685 }
2686 elsif ($methylation_calls[$index] eq 'Z') {
2687 $counting{total_meCpG_count}++;
2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2689 if ($read_identity == 1){
2690 $mbias_1{CpG}->{$index+1}->{meth}++;
2691 }
2692 else{
2693 $mbias_2{CpG}->{$index+1}->{meth}++;
2694 }
2695 }
2696 elsif ($methylation_calls[$index] eq 'z') {
2697 $counting{total_unmethylated_CpG_count}++;
2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2699 if ($read_identity == 1){
2700 $mbias_1{CpG}->{$index+1}->{un}++;
2701 }
2702 else{
2703 $mbias_2{CpG}->{$index+1}->{un}++;
2704 }
2705 }
2706 elsif ($methylation_calls[$index] eq 'H') {
2707 $counting{total_meCHH_count}++;
2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2709 if ($read_identity == 1){
2710 $mbias_1{CHH}->{$index+1}->{meth}++;
2711 }
2712 else{
2713 $mbias_2{CHH}->{$index+1}->{meth}++;
2714 }
2715 }
2716 elsif ($methylation_calls[$index] eq 'h') {
2717 $counting{total_unmethylated_CHH_count}++;
2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2719 if ($read_identity == 1){
2720 $mbias_1{CHH}->{$index+1}->{un}++;
2721 }
2722 else{
2723 $mbias_2{CHH}->{$index+1}->{un}++;
2724 }
2725 }
2726 elsif ($methylation_calls[$index] eq '.') {}
2727 elsif (lc$methylation_calls[$index] eq 'u'){}
2728 else{
2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2730 }
2731 }
2732 } else {
2733 die "The read orientation was neither + nor -: '$strand'\n";
2734 }
2735 }
2736
2737 ### strand-specific methylation output
2738 else {
2739 if ($strand eq '+') {
2740 for my $index (0..$#methylation_calls) {
2741
2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2745 $cigar_offset += $cigar_mod;
2746 $pos_offset += $pos_mod;
2747 }
2748
2749 ### Returning as soon as the methylation calls start overlapping
2750 if ($start+$index+$pos_offset >= $end_read_1) {
2751 return;
2752 }
2753
2754 if ($methylation_calls[$index] eq 'X') {
2755 $counting{total_meCHG_count}++;
2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2757 if ($read_identity == 1){
2758 $mbias_1{CHG}->{$index+1}->{meth}++;
2759 }
2760 else{
2761 $mbias_2{CHG}->{$index+1}->{meth}++;
2762 }
2763 }
2764 elsif ($methylation_calls[$index] eq 'x') {
2765 $counting{total_unmethylated_CHG_count}++;
2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2767 if ($read_identity == 1){
2768 $mbias_1{CHG}->{$index+1}->{un}++;
2769 }
2770 else{
2771 $mbias_2{CHG}->{$index+1}->{un}++;
2772 }
2773 }
2774 elsif ($methylation_calls[$index] eq 'Z') {
2775 $counting{total_meCpG_count}++;
2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2777 if ($read_identity == 1){
2778 $mbias_1{CpG}->{$index+1}->{meth}++;
2779 }
2780 else{
2781 $mbias_2{CpG}->{$index+1}->{meth}++;
2782 }
2783 }
2784 elsif ($methylation_calls[$index] eq 'z') {
2785 $counting{total_unmethylated_CpG_count}++;
2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2787 if ($read_identity == 1){
2788 $mbias_1{CpG}->{$index+1}->{un}++;
2789 }
2790 else{
2791 $mbias_2{CpG}->{$index+1}->{un}++;
2792 }
2793 }
2794 elsif ($methylation_calls[$index] eq 'H') {
2795 $counting{total_meCHH_count}++;
2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2797 if ($read_identity == 1){
2798 $mbias_1{CHH}->{$index+1}->{meth}++;
2799 }
2800 else{
2801 $mbias_2{CHH}->{$index+1}->{meth}++;
2802 }
2803 }
2804 elsif ($methylation_calls[$index] eq 'h') {
2805 $counting{total_unmethylated_CHH_count}++;
2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2807 if ($read_identity == 1){
2808 $mbias_1{CHH}->{$index+1}->{un}++;
2809 }
2810 else{
2811 $mbias_2{CHH}->{$index+1}->{un}++;
2812 }
2813 }
2814 elsif ($methylation_calls[$index] eq '.') {}
2815 elsif (lc$methylation_calls[$index] eq 'u'){}
2816 else{
2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2818 }
2819 }
2820 } elsif ($strand eq '-') {
2821 for my $index (0..$#methylation_calls) {
2822
2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2826 $cigar_offset += $cigar_mod;
2827 $pos_offset += $pos_mod;
2828 }
2829
2830 ### Returning as soon as the methylation calls start overlapping
2831 if ($start-$index+$pos_offset <= $end_read_1) {
2832 return;
2833 }
2834
2835 if ($methylation_calls[$index] eq 'X') {
2836 $counting{total_meCHG_count}++;
2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2838 if ($read_identity == 1){
2839 $mbias_1{CHG}->{$index+1}->{meth}++;
2840 }
2841 else{
2842 $mbias_2{CHG}->{$index+1}->{meth}++;
2843 }
2844 }
2845 elsif ($methylation_calls[$index] eq 'x') {
2846 $counting{total_unmethylated_CHG_count}++;
2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2848 if ($read_identity == 1){
2849 $mbias_1{CHG}->{$index+1}->{un}++;
2850 }
2851 else{
2852 $mbias_2{CHG}->{$index+1}->{un}++;
2853 }
2854 }
2855 elsif ($methylation_calls[$index] eq 'Z') {
2856 $counting{total_meCpG_count}++;
2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2858 if ($read_identity == 1){
2859 $mbias_1{CpG}->{$index+1}->{meth}++;
2860 }
2861 else{
2862 $mbias_2{CpG}->{$index+1}->{meth}++;
2863 }
2864 }
2865 elsif ($methylation_calls[$index] eq 'z') {
2866 $counting{total_unmethylated_CpG_count}++;
2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2868 if ($read_identity == 1){
2869 $mbias_1{CpG}->{$index+1}->{un}++;
2870 }
2871 else{
2872 $mbias_2{CpG}->{$index+1}->{un}++;
2873 }
2874 }
2875 elsif ($methylation_calls[$index] eq 'H') {
2876 $counting{total_meCHH_count}++;
2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2878 if ($read_identity == 1){
2879 $mbias_1{CHH}->{$index+1}->{meth}++;
2880 }
2881 else{
2882 $mbias_2{CHH}->{$index+1}->{meth}++;
2883 }
2884 }
2885 elsif ($methylation_calls[$index] eq 'h') {
2886 $counting{total_unmethylated_CHH_count}++;
2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2888 if ($read_identity == 1){
2889 $mbias_1{CHH}->{$index+1}->{un}++;
2890 }
2891 else{
2892 $mbias_2{CHH}->{$index+1}->{un}++;
2893 }
2894 }
2895 elsif ($methylation_calls[$index] eq '.') {}
2896 elsif (lc$methylation_calls[$index] eq 'u'){}
2897 else{
2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2899 }
2900 }
2901 } else {
2902 die "The strand orientation was neither + nor -: '$strand'/n";
2903 }
2904 }
2905 }
2906
2907 ### this is the default paired-end procedure allowing overlaps and using every single C position
2908 ### Still within the 2-CONTEXT ONLY optional section
2909 else {
2910 ### single-file CpG and non-CpG context output
2911 if ($full) {
2912 if ($strand eq '+') {
2913 for my $index (0..$#methylation_calls) {
2914
2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2918 $cigar_offset += $cigar_mod;
2919 $pos_offset += $pos_mod;
2920 }
2921
2922 if ($methylation_calls[$index] eq 'X') {
2923 $counting{total_meCHG_count}++;
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2925 if ($read_identity == 1){
2926 $mbias_1{CHG}->{$index+1}->{meth}++;
2927 }
2928 else{
2929 $mbias_2{CHG}->{$index+1}->{meth}++;
2930 }
2931 }
2932 elsif ($methylation_calls[$index] eq 'x') {
2933 $counting{total_unmethylated_CHG_count}++;
2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2935 if ($read_identity == 1){
2936 $mbias_1{CHG}->{$index+1}->{un}++;
2937 }
2938 else{
2939 $mbias_2{CHG}->{$index+1}->{un}++;
2940 }
2941 }
2942 elsif ($methylation_calls[$index] eq 'Z') {
2943 $counting{total_meCpG_count}++;
2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2945 if ($read_identity == 1){
2946 $mbias_1{CpG}->{$index+1}->{meth}++;
2947 }
2948 else{
2949 $mbias_2{CpG}->{$index+1}->{meth}++;
2950 }
2951 }
2952 elsif ($methylation_calls[$index] eq 'z') {
2953 $counting{total_unmethylated_CpG_count}++;
2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2955 if ($read_identity == 1){
2956 $mbias_1{CpG}->{$index+1}->{un}++;
2957 }
2958 else{
2959 $mbias_2{CpG}->{$index+1}->{un}++;
2960 }
2961 }
2962 elsif ($methylation_calls[$index] eq 'H') {
2963 $counting{total_meCHH_count}++;
2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2965 if ($read_identity == 1){
2966 $mbias_1{CHH}->{$index+1}->{meth}++;
2967 }
2968 else{
2969 $mbias_2{CHH}->{$index+1}->{meth}++;
2970 }
2971 }
2972 elsif ($methylation_calls[$index] eq 'h') {
2973 $counting{total_unmethylated_CHH_count}++;
2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2975 if ($read_identity == 1){
2976 $mbias_1{CHH}->{$index+1}->{un}++;
2977 }
2978 else{
2979 $mbias_2{CHH}->{$index+1}->{un}++;
2980 }
2981 }
2982 elsif ($methylation_calls[$index] eq '.') {}
2983 elsif (lc$methylation_calls[$index] eq 'u'){}
2984 else{
2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2986 }
2987 }
2988 } elsif ($strand eq '-') {
2989 for my $index (0..$#methylation_calls) {
2990
2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2994 $cigar_offset += $cigar_mod;
2995 $pos_offset += $pos_mod;
2996 }
2997
2998 if ($methylation_calls[$index] eq 'X') {
2999 $counting{total_meCHG_count}++;
3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3001 if ($read_identity == 1){
3002 $mbias_1{CHG}->{$index+1}->{meth}++;
3003 }
3004 else{
3005 $mbias_2{CHG}->{$index+1}->{meth}++;
3006 }
3007 }
3008 elsif ($methylation_calls[$index] eq 'x') {
3009 $counting{total_unmethylated_CHG_count}++;
3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3011 if ($read_identity == 1){
3012 $mbias_1{CHG}->{$index+1}->{un}++;
3013 }
3014 else{
3015 $mbias_2{CHG}->{$index+1}->{un}++;
3016 }
3017 }
3018 elsif ($methylation_calls[$index] eq 'Z') {
3019 $counting{total_meCpG_count}++;
3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3021 if ($read_identity == 1){
3022 $mbias_1{CpG}->{$index+1}->{meth}++;
3023 }
3024 else{
3025 $mbias_2{CpG}->{$index+1}->{meth}++;
3026 }
3027 }
3028 elsif ($methylation_calls[$index] eq 'z') {
3029 $counting{total_unmethylated_CpG_count}++;
3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3031 if ($read_identity == 1){
3032 $mbias_1{CpG}->{$index+1}->{un}++;
3033 }
3034 else{
3035 $mbias_2{CpG}->{$index+1}->{un}++;
3036 }
3037 }
3038 elsif ($methylation_calls[$index] eq 'H') {
3039 $counting{total_meCHH_count}++;
3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3041 if ($read_identity == 1){
3042 $mbias_1{CHH}->{$index+1}->{meth}++;
3043 }
3044 else{
3045 $mbias_2{CHH}->{$index+1}->{meth}++;
3046 }
3047 }
3048 elsif ($methylation_calls[$index] eq 'h') {
3049 $counting{total_unmethylated_CHH_count}++;
3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3051 if ($read_identity == 1){
3052 $mbias_1{CHH}->{$index+1}->{un}++;
3053 }
3054 else{
3055 $mbias_2{CHH}->{$index+1}->{un}++;
3056 }
3057 }
3058 elsif ($methylation_calls[$index] eq '.') {}
3059 elsif (lc$methylation_calls[$index] eq 'u'){}
3060 else{
3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3062 }
3063 }
3064 } else {
3065 die "The strand orientation as neither + nor -: '$strand'\n";
3066 }
3067 }
3068
3069 ### strand-specific methylation output
3070 ### still within the 2-CONTEXT optional section
3071 else {
3072 if ($strand eq '+') {
3073 for my $index (0..$#methylation_calls) {
3074
3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3078 $cigar_offset += $cigar_mod;
3079 $pos_offset += $pos_mod;
3080 }
3081
3082 if ($methylation_calls[$index] eq 'X') {
3083 $counting{total_meCHG_count}++;
3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3085 if ($read_identity == 1){
3086 $mbias_1{CHG}->{$index+1}->{meth}++;
3087 }
3088 else{
3089 $mbias_2{CHG}->{$index+1}->{meth}++;
3090 }
3091 }
3092 elsif ($methylation_calls[$index] eq 'x') {
3093 $counting{total_unmethylated_CHG_count}++;
3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3095 if ($read_identity == 1){
3096 $mbias_1{CHG}->{$index+1}->{un}++;
3097 }
3098 else{
3099 $mbias_2{CHG}->{$index+1}->{un}++;
3100 }
3101 }
3102 elsif ($methylation_calls[$index] eq 'Z') {
3103 $counting{total_meCpG_count}++;
3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3105 if ($read_identity == 1){
3106 $mbias_1{CpG}->{$index+1}->{meth}++;
3107 }
3108 else{
3109 $mbias_2{CpG}->{$index+1}->{meth}++;
3110 }
3111 }
3112 elsif ($methylation_calls[$index] eq 'z') {
3113 $counting{total_unmethylated_CpG_count}++;
3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3115 if ($read_identity == 1){
3116 $mbias_1{CpG}->{$index+1}->{un}++;
3117 }
3118 else{
3119 $mbias_2{CpG}->{$index+1}->{un}++;
3120 }
3121 }
3122 elsif ($methylation_calls[$index] eq 'H') {
3123 $counting{total_meCHH_count}++;
3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3125 if ($read_identity == 1){
3126 $mbias_1{CHH}->{$index+1}->{meth}++;
3127 }
3128 else{
3129 $mbias_2{CHH}->{$index+1}->{meth}++;
3130 }
3131 }
3132 elsif ($methylation_calls[$index] eq 'h') {
3133 $counting{total_unmethylated_CHH_count}++;
3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3135 if ($read_identity == 1){
3136 $mbias_1{CHH}->{$index+1}->{un}++;
3137 }
3138 else{
3139 $mbias_2{CHH}->{$index+1}->{un}++;
3140 }
3141 }
3142 elsif ($methylation_calls[$index] eq '.') {}
3143 elsif (lc$methylation_calls[$index] eq 'u'){}
3144 else{
3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3146 }
3147 }
3148 } elsif ($strand eq '-') {
3149 for my $index (0..$#methylation_calls) {
3150
3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3154 $cigar_offset += $cigar_mod;
3155 $pos_offset += $pos_mod;
3156 }
3157
3158 if ($methylation_calls[$index] eq 'X') {
3159 $counting{total_meCHG_count}++;
3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3161 if ($read_identity == 1){
3162 $mbias_1{CHG}->{$index+1}->{meth}++;
3163 }
3164 else{
3165 $mbias_2{CHG}->{$index+1}->{meth}++;
3166 }
3167 }
3168 elsif ($methylation_calls[$index] eq 'x') {
3169 $counting{total_unmethylated_CHG_count}++;
3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3171 if ($read_identity == 1){
3172 $mbias_1{CHG}->{$index+1}->{un}++;
3173 }
3174 else{
3175 $mbias_2{CHG}->{$index+1}->{un}++;
3176 }
3177 }
3178 elsif ($methylation_calls[$index] eq 'Z') {
3179 $counting{total_meCpG_count}++;
3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3181 if ($read_identity == 1){
3182 $mbias_1{CpG}->{$index+1}->{meth}++;
3183 }
3184 else{
3185 $mbias_2{CpG}->{$index+1}->{meth}++;
3186 }
3187 }
3188 elsif ($methylation_calls[$index] eq 'z') {
3189 $counting{total_unmethylated_CpG_count}++;
3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3191 if ($read_identity == 1){
3192 $mbias_1{CpG}->{$index+1}->{un}++;
3193 }
3194 else{
3195 $mbias_2{CpG}->{$index+1}->{un}++;
3196 }
3197 }
3198 elsif ($methylation_calls[$index] eq 'H') {
3199 $counting{total_meCHH_count}++;
3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3201 if ($read_identity == 1){
3202 $mbias_1{CHH}->{$index+1}->{meth}++;
3203 }
3204 else{
3205 $mbias_2{CHH}->{$index+1}->{meth}++;
3206 }
3207 }
3208 elsif ($methylation_calls[$index] eq 'h') {
3209 $counting{total_unmethylated_CHH_count}++;
3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3211 if ($read_identity == 1){
3212 $mbias_1{CHH}->{$index+1}->{un}++;
3213 }
3214 else{
3215 $mbias_2{CHH}->{$index+1}->{un}++;
3216 }
3217 }
3218 elsif ($methylation_calls[$index] eq '.') {}
3219 elsif (lc$methylation_calls[$index] eq 'u'){}
3220 else{
3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3222 }
3223 }
3224 } else {
3225 die "The strand orientation as neither + nor -: '$strand'\n";
3226 }
3227 }
3228 }
3229 }
3230
3231 ############################################
3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
3233 ############################################
3234
3235 elsif ($no_overlap) {
3236 ### single-file CpG, CHG and CHH context output
3237 if ($full) {
3238 if ($strand eq '+') {
3239 for my $index (0..$#methylation_calls) {
3240
3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3244 $cigar_offset += $cigar_mod;
3245 $pos_offset += $pos_mod;
3246 }
3247
3248 ### Returning as soon as the methylation calls start overlapping
3249 if ($start+$index+$pos_offset >= $end_read_1) {
3250 return;
3251 }
3252
3253 if ($methylation_calls[$index] eq 'X') {
3254 $counting{total_meCHG_count}++;
3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3256 if ($read_identity == 1){
3257 $mbias_1{CHG}->{$index+1}->{meth}++;
3258 }
3259 else{
3260 $mbias_2{CHG}->{$index+1}->{meth}++;
3261 }
3262 }
3263 elsif ($methylation_calls[$index] eq 'x') {
3264 $counting{total_unmethylated_CHG_count}++;
3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3266 if ($read_identity == 1){
3267 $mbias_1{CHG}->{$index+1}->{un}++;
3268 }
3269 else{
3270 $mbias_2{CHG}->{$index+1}->{un}++;
3271 }
3272 }
3273 elsif ($methylation_calls[$index] eq 'Z') {
3274 $counting{total_meCpG_count}++;
3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3276 if ($read_identity == 1){
3277 $mbias_1{CpG}->{$index+1}->{meth}++;
3278 }
3279 else{
3280 $mbias_2{CpG}->{$index+1}->{meth}++;
3281 }
3282 }
3283 elsif ($methylation_calls[$index] eq 'z') {
3284 $counting{total_unmethylated_CpG_count}++;
3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3286 if ($read_identity == 1){
3287 $mbias_1{CpG}->{$index+1}->{un}++;
3288 }
3289 else{
3290 $mbias_2{CpG}->{$index+1}->{un}++;
3291 }
3292 }
3293 elsif ($methylation_calls[$index] eq 'H') {
3294 $counting{total_meCHH_count}++;
3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3296 if ($read_identity == 1){
3297 $mbias_1{CHH}->{$index+1}->{meth}++;
3298 }
3299 else{
3300 $mbias_2{CHH}->{$index+1}->{meth}++;
3301 }
3302 }
3303 elsif ($methylation_calls[$index] eq 'h') {
3304 $counting{total_unmethylated_CHH_count}++;
3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3306 if ($read_identity == 1){
3307 $mbias_1{CHH}->{$index+1}->{un}++;
3308 }
3309 else{
3310 $mbias_2{CHH}->{$index+1}->{un}++;
3311 }
3312 }
3313 elsif ($methylation_calls[$index] eq '.') {}
3314 elsif (lc$methylation_calls[$index] eq 'u'){}
3315 else{
3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3317 }
3318 }
3319 } elsif ($strand eq '-') {
3320 for my $index (0..$#methylation_calls) {
3321
3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3325 $cigar_offset += $cigar_mod;
3326 $pos_offset += $pos_mod;
3327 }
3328
3329 ### Returning as soon as the methylation calls start overlapping
3330 if ($start-$index+$pos_offset <= $end_read_1) {
3331 return;
3332 }
3333
3334 if ($methylation_calls[$index] eq 'X') {
3335 $counting{total_meCHG_count}++;
3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3337 if ($read_identity == 1){
3338 $mbias_1{CHG}->{$index+1}->{meth}++;
3339 }
3340 else{
3341 $mbias_2{CHG}->{$index+1}->{meth}++;
3342 }
3343 }
3344 elsif ($methylation_calls[$index] eq 'x') {
3345 $counting{total_unmethylated_CHG_count}++;
3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3347 if ($read_identity == 1){
3348 $mbias_1{CHG}->{$index+1}->{un}++;
3349 }
3350 else{
3351 $mbias_2{CHG}->{$index+1}->{un}++;
3352 }
3353 }
3354 elsif ($methylation_calls[$index] eq 'Z') {
3355 $counting{total_meCpG_count}++;
3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3357 if ($read_identity == 1){
3358 $mbias_1{CpG}->{$index+1}->{meth}++;
3359 }
3360 else{
3361 $mbias_2{CpG}->{$index+1}->{meth}++;
3362 }
3363 }
3364 elsif ($methylation_calls[$index] eq 'z') {
3365 $counting{total_unmethylated_CpG_count}++;
3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3367 if ($read_identity == 1){
3368 $mbias_1{CpG}->{$index+1}->{un}++;
3369 }
3370 else{
3371 $mbias_2{CpG}->{$index+1}->{un}++;
3372 }
3373 }
3374 elsif ($methylation_calls[$index] eq 'H') {
3375 $counting{total_meCHH_count}++;
3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3377 if ($read_identity == 1){
3378 $mbias_1{CHH}->{$index+1}->{meth}++;
3379 }
3380 else{
3381 $mbias_2{CHH}->{$index+1}->{meth}++;
3382 }
3383 }
3384 elsif ($methylation_calls[$index] eq 'h') {
3385 $counting{total_unmethylated_CHH_count}++;
3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3387 if ($read_identity == 1){
3388 $mbias_1{CHH}->{$index+1}->{un}++;
3389 }
3390 else{
3391 $mbias_2{CHH}->{$index+1}->{un}++;
3392 }
3393 }
3394 elsif ($methylation_calls[$index] eq '.') {}
3395 elsif (lc$methylation_calls[$index] eq 'u'){}
3396 else{
3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3398 }
3399 }
3400 } else {
3401 die "The strand orientation as neither + nor -: '$strand'\n";
3402 }
3403 }
3404
3405 ### strand-specific methylation output
3406 else {
3407 if ($strand eq '+') {
3408 for my $index (0..$#methylation_calls) {
3409
3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3413 $cigar_offset += $cigar_mod;
3414 $pos_offset += $pos_mod;
3415 }
3416
3417 ### Returning as soon as the methylation calls start overlapping
3418 if ($start+$index+$pos_offset >= $end_read_1) {
3419 return;
3420 }
3421
3422 if ($methylation_calls[$index] eq 'X') {
3423 $counting{total_meCHG_count}++;
3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3425 if ($read_identity == 1){
3426 $mbias_1{CHG}->{$index+1}->{meth}++;
3427 }
3428 else{
3429 $mbias_2{CHG}->{$index+1}->{meth}++;
3430 }
3431 }
3432 elsif ($methylation_calls[$index] eq 'x') {
3433 $counting{total_unmethylated_CHG_count}++;
3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3435 if ($read_identity == 1){
3436 $mbias_1{CHG}->{$index+1}->{un}++;
3437 }
3438 else{
3439 $mbias_2{CHG}->{$index+1}->{un}++;
3440 }
3441 }
3442 elsif ($methylation_calls[$index] eq 'Z') {
3443 $counting{total_meCpG_count}++;
3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3445 if ($read_identity == 1){
3446 $mbias_1{CpG}->{$index+1}->{meth}++;
3447 }
3448 else{
3449 $mbias_2{CpG}->{$index+1}->{meth}++;
3450 }
3451 }
3452 elsif ($methylation_calls[$index] eq 'z') {
3453 $counting{total_unmethylated_CpG_count}++;
3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3455 if ($read_identity == 1){
3456 $mbias_1{CpG}->{$index+1}->{un}++;
3457 }
3458 else{
3459 $mbias_2{CpG}->{$index+1}->{un}++;
3460 }
3461 }
3462 elsif ($methylation_calls[$index] eq 'H') {
3463 $counting{total_meCHH_count}++;
3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3465 if ($read_identity == 1){
3466 $mbias_1{CHH}->{$index+1}->{meth}++;
3467 }
3468 else{
3469 $mbias_2{CHH}->{$index+1}->{meth}++;
3470 }
3471 }
3472 elsif ($methylation_calls[$index] eq 'h') {
3473 $counting{total_unmethylated_CHH_count}++;
3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3475 if ($read_identity == 1){
3476 $mbias_1{CHH}->{$index+1}->{un}++;
3477 }
3478 else{
3479 $mbias_2{CHH}->{$index+1}->{un}++;
3480 }
3481 }
3482 elsif ($methylation_calls[$index] eq '.') {}
3483 elsif (lc$methylation_calls[$index] eq 'u'){}
3484 else{
3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3486 }
3487 }
3488 } elsif ($strand eq '-') {
3489 for my $index (0..$#methylation_calls) {
3490
3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3494 $cigar_offset += $cigar_mod;
3495 $pos_offset += $pos_mod;
3496 }
3497
3498 ### Returning as soon as the methylation calls start overlapping
3499 if ($start-$index+$pos_offset <= $end_read_1) {
3500 return;
3501 }
3502
3503 if ($methylation_calls[$index] eq 'X') {
3504 $counting{total_meCHG_count}++;
3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3506 if ($read_identity == 1){
3507 $mbias_1{CHG}->{$index+1}->{meth}++;
3508 }
3509 else{
3510 $mbias_2{CHG}->{$index+1}->{meth}++;
3511 }
3512 }
3513 elsif ($methylation_calls[$index] eq 'x') {
3514 $counting{total_unmethylated_CHG_count}++;
3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3516 if ($read_identity == 1){
3517 $mbias_1{CHG}->{$index+1}->{un}++;
3518 }
3519 else{
3520 $mbias_2{CHG}->{$index+1}->{un}++;
3521 }
3522 }
3523 elsif ($methylation_calls[$index] eq 'Z') {
3524 $counting{total_meCpG_count}++;
3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3526 if ($read_identity == 1){
3527 $mbias_1{CpG}->{$index+1}->{meth}++;
3528 }
3529 else{
3530 $mbias_2{CpG}->{$index+1}->{meth}++;
3531 }
3532 }
3533 elsif ($methylation_calls[$index] eq 'z') {
3534 $counting{total_unmethylated_CpG_count}++;
3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3536 if ($read_identity == 1){
3537 $mbias_1{CpG}->{$index+1}->{un}++;
3538 }
3539 else{
3540 $mbias_2{CpG}->{$index+1}->{un}++;
3541 }
3542 }
3543 elsif ($methylation_calls[$index] eq 'H') {
3544 $counting{total_meCHH_count}++;
3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3546 if ($read_identity == 1){
3547 $mbias_1{CHH}->{$index+1}->{meth}++;
3548 }
3549 else{
3550 $mbias_2{CHH}->{$index+1}->{meth}++;
3551 }
3552 }
3553 elsif ($methylation_calls[$index] eq 'h') {
3554 $counting{total_unmethylated_CHH_count}++;
3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3556 if ($read_identity == 1){
3557 $mbias_1{CHH}->{$index+1}->{un}++;
3558 }
3559 else{
3560 $mbias_2{CHH}->{$index+1}->{un}++;
3561 }
3562 }
3563 elsif ($methylation_calls[$index] eq '.') {}
3564 elsif (lc$methylation_calls[$index] eq 'u'){}
3565 else{
3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3567 }
3568 }
3569 } else {
3570 die "The strand orientation as neither + nor -: '$strand'\n";
3571 }
3572 }
3573 }
3574
3575 ### this is the default paired-end procedure allowing overlaps and using every single C position
3576 else {
3577 ### single-file CpG, CHG and CHH context output
3578 if ($full) {
3579 if ($strand eq '+') {
3580 for my $index (0..$#methylation_calls) {
3581
3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3585 $cigar_offset += $cigar_mod;
3586 $pos_offset += $pos_mod;
3587 }
3588
3589 if ($methylation_calls[$index] eq 'X') {
3590 $counting{total_meCHG_count}++;
3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3592 if ($read_identity == 1){
3593 $mbias_1{CHG}->{$index+1}->{meth}++;
3594 }
3595 else{
3596 $mbias_2{CHG}->{$index+1}->{meth}++;
3597 }
3598 }
3599 elsif ($methylation_calls[$index] eq 'x') {
3600 $counting{total_unmethylated_CHG_count}++;
3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3602 if ($read_identity == 1){
3603 $mbias_1{CHG}->{$index+1}->{un}++;
3604 }
3605 else{
3606 $mbias_2{CHG}->{$index+1}->{un}++;
3607 }
3608 }
3609 elsif ($methylation_calls[$index] eq 'Z') {
3610 $counting{total_meCpG_count}++;
3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3612 if ($read_identity == 1){
3613 $mbias_1{CpG}->{$index+1}->{meth}++;
3614 }
3615 else{
3616 $mbias_2{CpG}->{$index+1}->{meth}++;
3617 }
3618 }
3619 elsif ($methylation_calls[$index] eq 'z') {
3620 $counting{total_unmethylated_CpG_count}++;
3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3622 if ($read_identity == 1){
3623 $mbias_1{CpG}->{$index+1}->{un}++;
3624 }
3625 else{
3626 $mbias_2{CpG}->{$index+1}->{un}++;
3627 }
3628 }
3629 elsif ($methylation_calls[$index] eq 'H') {
3630 $counting{total_meCHH_count}++;
3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3632 if ($read_identity == 1){
3633 $mbias_1{CHH}->{$index+1}->{meth}++;
3634 }
3635 else{
3636 $mbias_2{CHH}->{$index+1}->{meth}++;
3637 }
3638 }
3639 elsif ($methylation_calls[$index] eq 'h') {
3640 $counting{total_unmethylated_CHH_count}++;
3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3642 if ($read_identity == 1){
3643 $mbias_1{CHH}->{$index+1}->{un}++;
3644 }
3645 else{
3646 $mbias_2{CHH}->{$index+1}->{un}++;
3647 }
3648 }
3649 elsif ($methylation_calls[$index] eq '.') {}
3650 elsif (lc$methylation_calls[$index] eq 'u'){}
3651 else{
3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3653 }
3654 }
3655 } elsif ($strand eq '-') {
3656 for my $index (0..$#methylation_calls) {
3657
3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3661 $cigar_offset += $cigar_mod;
3662 $pos_offset += $pos_mod;
3663 }
3664
3665 if ($methylation_calls[$index] eq 'X') {
3666 $counting{total_meCHG_count}++;
3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3668 if ($read_identity == 1){
3669 $mbias_1{CHG}->{$index+1}->{meth}++;
3670 }
3671 else{
3672 $mbias_2{CHG}->{$index+1}->{meth}++;
3673 }
3674 }
3675 elsif ($methylation_calls[$index] eq 'x') {
3676 $counting{total_unmethylated_CHG_count}++;
3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3678 if ($read_identity == 1){
3679 $mbias_1{CHG}->{$index+1}->{un}++;
3680 }
3681 else{
3682 $mbias_2{CHG}->{$index+1}->{un}++;
3683 }
3684 }
3685 elsif ($methylation_calls[$index] eq 'Z') {
3686 $counting{total_meCpG_count}++;
3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3688 if ($read_identity == 1){
3689 $mbias_1{CpG}->{$index+1}->{meth}++;
3690 }
3691 else{
3692 $mbias_2{CpG}->{$index+1}->{meth}++;
3693 }
3694 }
3695 elsif ($methylation_calls[$index] eq 'z') {
3696 $counting{total_unmethylated_CpG_count}++;
3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3698 if ($read_identity == 1){
3699 $mbias_1{CpG}->{$index+1}->{un}++;
3700 }
3701 else{
3702 $mbias_2{CpG}->{$index+1}->{un}++;
3703 }
3704 }
3705 elsif ($methylation_calls[$index] eq 'H') {
3706 $counting{total_meCHH_count}++;
3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3708 if ($read_identity == 1){
3709 $mbias_1{CHH}->{$index+1}->{meth}++;
3710 }
3711 else{
3712 $mbias_2{CHH}->{$index+1}->{meth}++;
3713 }
3714 }
3715 elsif ($methylation_calls[$index] eq 'h') {
3716 $counting{total_unmethylated_CHH_count}++;
3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3718 if ($read_identity == 1){
3719 $mbias_1{CHH}->{$index+1}->{un}++;
3720 }
3721 else{
3722 $mbias_2{CHH}->{$index+1}->{un}++;
3723 }
3724 }
3725 elsif ($methylation_calls[$index] eq '.') {}
3726 elsif (lc$methylation_calls[$index] eq 'u'){}
3727 else{
3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3729 }
3730 }
3731 } else {
3732 die "The strand orientation as neither + nor -: '$strand'\n";
3733 }
3734 }
3735
3736 ### strand-specific methylation output
3737 else {
3738 if ($strand eq '+') {
3739 for my $index (0..$#methylation_calls) {
3740
3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3744 $cigar_offset += $cigar_mod;
3745 $pos_offset += $pos_mod;
3746 }
3747
3748 if ($methylation_calls[$index] eq 'X') {
3749 $counting{total_meCHG_count}++;
3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3751 if ($read_identity == 1){
3752 $mbias_1{CHG}->{$index+1}->{meth}++;
3753 }
3754 else{
3755 $mbias_2{CHG}->{$index+1}->{meth}++;
3756 }
3757 }
3758 elsif ($methylation_calls[$index] eq 'x') {
3759 $counting{total_unmethylated_CHG_count}++;
3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3761 if ($read_identity == 1){
3762 $mbias_1{CHG}->{$index+1}->{un}++;
3763 }
3764 else{
3765 $mbias_2{CHG}->{$index+1}->{un}++;
3766 }
3767 }
3768 elsif ($methylation_calls[$index] eq 'Z') {
3769 $counting{total_meCpG_count}++;
3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3771 if ($read_identity == 1){
3772 $mbias_1{CpG}->{$index+1}->{meth}++;
3773 }
3774 else{
3775 $mbias_2{CpG}->{$index+1}->{meth}++;
3776 }
3777 }
3778 elsif ($methylation_calls[$index] eq 'z') {
3779 $counting{total_unmethylated_CpG_count}++;
3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3781 if ($read_identity == 1){
3782 $mbias_1{CpG}->{$index+1}->{un}++;
3783 }
3784 else{
3785 $mbias_2{CpG}->{$index+1}->{un}++;
3786 }
3787 }
3788 elsif ($methylation_calls[$index] eq 'H') {
3789 $counting{total_meCHH_count}++;
3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3791 if ($read_identity == 1){
3792 $mbias_1{CHH}->{$index+1}->{meth}++;
3793 }
3794 else{
3795 $mbias_2{CHH}->{$index+1}->{meth}++;
3796 }
3797 }
3798 elsif ($methylation_calls[$index] eq 'h') {
3799 $counting{total_unmethylated_CHH_count}++;
3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3801 if ($read_identity == 1){
3802 $mbias_1{CHH}->{$index+1}->{un}++;
3803 }
3804 else{
3805 $mbias_2{CHH}->{$index+1}->{un}++;
3806 }
3807 }
3808 elsif ($methylation_calls[$index] eq '.') {}
3809 elsif (lc$methylation_calls[$index] eq 'u'){}
3810 else{
3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3812 }
3813 }
3814 } elsif ($strand eq '-') {
3815 for my $index (0..$#methylation_calls) {
3816
3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3820 $cigar_offset += $cigar_mod;
3821 $pos_offset += $pos_mod;
3822 }
3823
3824 if ($methylation_calls[$index] eq 'X') {
3825 $counting{total_meCHG_count}++;
3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3827 if ($read_identity == 1){
3828 $mbias_1{CHG}->{$index+1}->{meth}++;
3829 }
3830 else{
3831 $mbias_2{CHG}->{$index+1}->{meth}++;
3832 }
3833 }
3834 elsif ($methylation_calls[$index] eq 'x') {
3835 $counting{total_unmethylated_CHG_count}++;
3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3837 if ($read_identity == 1){
3838 $mbias_1{CHG}->{$index+1}->{un}++;
3839 }
3840 else{
3841 $mbias_2{CHG}->{$index+1}->{un}++;
3842 }
3843 }
3844 elsif ($methylation_calls[$index] eq 'Z') {
3845 $counting{total_meCpG_count}++;
3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3847 if ($read_identity == 1){
3848 $mbias_1{CpG}->{$index+1}->{meth}++;
3849 }
3850 else{
3851 $mbias_2{CpG}->{$index+1}->{meth}++;
3852 }
3853 }
3854 elsif ($methylation_calls[$index] eq 'z') {
3855 $counting{total_unmethylated_CpG_count}++;
3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3857 if ($read_identity == 1){
3858 $mbias_1{CpG}->{$index+1}->{un}++;
3859 }
3860 else{
3861 $mbias_2{CpG}->{$index+1}->{un}++;
3862 }
3863 }
3864 elsif ($methylation_calls[$index] eq 'H') {
3865 $counting{total_meCHH_count}++;
3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3867 if ($read_identity == 1){
3868 $mbias_1{CHH}->{$index+1}->{meth}++;
3869 }
3870 else{
3871 $mbias_2{CHH}->{$index+1}->{meth}++;
3872 }
3873 }
3874 elsif ($methylation_calls[$index] eq 'h') {
3875 $counting{total_unmethylated_CHH_count}++;
3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3877 if ($read_identity == 1){
3878 $mbias_1{CHH}->{$index+1}->{un}++;
3879 }
3880 else{
3881 $mbias_2{CHH}->{$index+1}->{un}++;
3882 }
3883 }
3884 elsif ($methylation_calls[$index] eq '.') {}
3885 elsif (lc$methylation_calls[$index] eq 'u'){}
3886 else{
3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3888 }
3889 }
3890 } else {
3891 die "The strand orientation as neither + nor -: '$strand'\n";
3892 }
3893 }
3894 }
3895 }
3896
### Works out by how much the CIGAR index and the genomic position of a single
### methylation call have to be shifted for reads containing insertions or deletions.
###
### Arguments:
###   $index        - 0-based index of the current position in the methylation call string
###   $cigar_offset - offset into the composite CIGAR array accumulated by previous calls
###   $pos_offset   - accumulated genomic position offset (accepted for interface
###                   compatibility; it is not read by this subroutine)
###   $strand       - '+' or '-'
###   $comp_cigar   - reference to an array holding one CIGAR operation per element
###                   ('M', 'I' or 'D')
###
### Returns the list ($new_cigar_offset,$new_pos_offset). Both values are increments
### for this one position, not running totals. For any strand other than '+' or '-'
### the CIGAR array is not inspected and (0,0) is returned.
###
### Dies if the operation looked at is anything other than 'M', 'I' or 'D'; this
### includes looking beyond the end of the composite CIGAR array.
sub check_cigar_string {
    my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
    my ($new_cigar_offset,$new_pos_offset) = (0,0);

    ### Both strands are processed identically, only the sign of the position adjustment is
    ### mirrored: on the '+' strand an insertion moves the position by -1 and a deletion by +1,
    ### on the '-' strand it is the other way round
    my $direction;
    if ($strand eq '+') {
        $direction = 1;
    }
    elsif ($strand eq '-') {
        $direction = -1;
    }
    else {
        return ($new_cigar_offset,$new_pos_offset);
    }

    ### single element access (not a one-element slice); a position beyond the end of the
    ### composite CIGAR is treated as an empty, i.e. unrecognised, operation
    my $operation = $comp_cigar->[$index + $cigar_offset];
    $operation = '' unless (defined $operation);

    if ($operation eq 'M') {
        ### sequence position matches the genomic position, no adjustment needed
    }
    elsif ($operation eq 'I') {
        ### insertion in the read sequence: the inserted base does not exist in the genome
        $new_pos_offset -= $direction;
    }
    elsif ($operation eq 'D') {
        ### deletion in the read sequence: the composite CIGAR string does no longer match
        ### the methylation call index, and the deleted base has to be skipped in the genome
        $new_cigar_offset += 1;
        $new_pos_offset   += $direction;

        ### looping through the CIGAR string until we hit another M or I (or run out of operations)
        while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ) {
            my $next_operation = $comp_cigar->[$index + $cigar_offset + $new_cigar_offset];
            $next_operation = '' unless (defined $next_operation);

            if ($next_operation eq 'M') {
                last;
            }
            elsif ($next_operation eq 'I') {
                $new_pos_offset -= $direction; # the insertion applies to the current read position
                last;
            }
            elsif ($next_operation eq 'D') {
                $new_cigar_offset += 1;          # another deleted base to skip
                $new_pos_offset   += $direction;
            }
            else {
                die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$next_operation'\n";
            }
        }
    }
    else {
        die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
    }

    return ($new_cigar_offset,$new_pos_offset);
}
3988
3989 sub print_individual_C_methylation_states_single_end{
3990
3991 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
1942 my @methylation_calls = split(//,$meth_call); 3992 my @methylation_calls = split(//,$meth_call);
1943 3993
1944 ################################################################# 3994 #################################################################
1945 ### . for bases not involving cytosines ### 3995 ### . for bases not involving cytosines ###
1946 ### X for methylated C in CHG context (was protected) ### 3996 ### X for methylated C in CHG context (was protected) ###
1956 my $methyl_CpG_count = 0; 4006 my $methyl_CpG_count = 0;
1957 my $unmethylated_CHG_count = 0; 4007 my $unmethylated_CHG_count = 0;
1958 my $unmethylated_CHH_count = 0; 4008 my $unmethylated_CHH_count = 0;
1959 my $unmethylated_CpG_count = 0; 4009 my $unmethylated_CpG_count = 0;
1960 4010
1961
1962 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions 4011 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
1963 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels 4012 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
4013
1964 my @comp_cigar; 4014 my @comp_cigar;
1965 4015
1966 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing 4016 if ($cigar){ # parsing CIGAR string
1967 if ($cigar =~ /^\d+M$/){ 4017
1968 } 4018 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
1969 else{ # parsing CIGAR string 4019 if ($cigar =~ /^\d+M$/){
1970 my @len; 4020 # warn "See!? I told you so! $cigar\n";
1971 my @ops; 4021 # sleep(1);
1972 @len = split (/\D+/,$cigar); # storing the length per operation 4022 }
1973 @ops = split (/\d+/,$cigar); # storing the operation 4023 else{
1974 shift @ops; # remove the empty first element 4024
1975 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); 4025 my @len;
1976 4026 my @ops;
1977 foreach my $index (0..$#len){ 4027
1978 foreach (1..$len[$index]){ 4028 @len = split (/\D+/,$cigar); # storing the length per operation
1979 # print "$ops[$index]"; 4029 @ops = split (/\d+/,$cigar); # storing the operation
1980 push @comp_cigar, $ops[$index]; 4030 shift @ops; # remove the empty first element
4031 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
4032 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
4033
4034 foreach my $index (0..$#len){
4035 foreach (1..$len[$index]){
4036 # print "$ops[$index]";
4037 push @comp_cigar, $ops[$index];
4038 }
1981 } 4039 }
1982 } 4040 }
1983 # warn "\nDetected CIGAR string: $cigar\n"; 4041 # warn "\nDetected CIGAR string: $cigar\n";
1984 # warn "Length of methylation call: ",length $meth_call,"\n"; 4042 # warn "Length of methylation call: ",length $meth_call,"\n";
1985 # warn "number of operations: ",scalar @ops,"\n"; 4043 # warn "number of operations: ",scalar @ops,"\n";
1987 # print @comp_cigar,"\n"; 4045 # print @comp_cigar,"\n";
1988 # print "$meth_call\n\n"; 4046 # print "$meth_call\n\n";
1989 # sleep (1); 4047 # sleep (1);
1990 } 4048 }
1991 4049
1992 if ($strand eq '-') {
1993
1994 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
1995 if (@comp_cigar){
1996 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1997 }
1998 # print "reverse CIGAR string: @comp_cigar\n";
1999
2000 ### the start position of paired-end files has already been corrected, see above
2001 }
2002
2003 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2004
2005 if ($merge_non_CpG) {
2006
2007 if ($no_overlap) {
2008
2009 ### single-file CpG and non-CpG context output
2010 if ($full) {
2011 if ($strand eq '+') {
2012 for my $index (0..$#methylation_calls) {
2013
2014 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2015 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2016 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2017 $cigar_offset += $cigar_mod;
2018 $pos_offset += $pos_mod;
2019 }
2020
2021 ### Returning as soon as the methylation calls start overlapping
2022 if ($start+$index+$pos_offset >= $end_read_1) {
2023 return;
2024 }
2025
2026 if ($methylation_calls[$index] eq 'X') {
2027 $counting{total_meCHG_count}++;
2028 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2029 } elsif ($methylation_calls[$index] eq 'x') {
2030 $counting{total_unmethylated_CHG_count}++;
2031 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2032 } elsif ($methylation_calls[$index] eq 'Z') {
2033 $counting{total_meCpG_count}++;
2034 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2035 } elsif ($methylation_calls[$index] eq 'z') {
2036 $counting{total_unmethylated_CpG_count}++;
2037 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2038 } elsif ($methylation_calls[$index] eq 'H') {
2039 $counting{total_meCHH_count}++;
2040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2041 } elsif ($methylation_calls[$index] eq 'h') {
2042 $counting{total_unmethylated_CHH_count}++;
2043 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2044 }
2045 elsif ($methylation_calls[$index] eq '.'){}
2046 else{
2047 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2048 }
2049 }
2050 } elsif ($strand eq '-') {
2051 for my $index (0..$#methylation_calls) {
2052
2053 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2054 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2055 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2056 $cigar_offset += $cigar_mod;
2057 $pos_offset += $pos_mod;
2058 }
2059
2060 ### Returning as soon as the methylation calls start overlapping
2061 if ($start-$index+$pos_offset <= $end_read_1) {
2062 return;
2063 }
2064
2065 if ($methylation_calls[$index] eq 'X') {
2066 $counting{total_meCHG_count}++;
2067 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2068 } elsif ($methylation_calls[$index] eq 'x') {
2069 $counting{total_unmethylated_CHG_count}++;
2070 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2071 } elsif ($methylation_calls[$index] eq 'Z') {
2072 $counting{total_meCpG_count}++;
2073 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2074 } elsif ($methylation_calls[$index] eq 'z') {
2075 $counting{total_unmethylated_CpG_count}++;
2076 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2077 } elsif ($methylation_calls[$index] eq 'H') {
2078 $counting{total_meCHH_count}++;
2079 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2080 } elsif ($methylation_calls[$index] eq 'h') {
2081 $counting{total_unmethylated_CHH_count}++;
2082 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2083 }
2084 elsif ($methylation_calls[$index] eq '.') {}
2085 else{
2086 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2087 }
2088 }
2089 } else {
2090 die "The read orientation was neither + nor -: '$strand'\n";
2091 }
2092 }
2093
2094 ### strand-specific methylation output
2095 else {
2096 if ($strand eq '+') {
2097 for my $index (0..$#methylation_calls) {
2098
2099 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2100 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2101 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2102 $cigar_offset += $cigar_mod;
2103 $pos_offset += $pos_mod;
2104 }
2105
2106 ### Returning as soon as the methylation calls start overlapping
2107 if ($start+$index+$pos_offset >= $end_read_1) {
2108 return;
2109 }
2110
2111 if ($methylation_calls[$index] eq 'X') {
2112 $counting{total_meCHG_count}++;
2113 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2114 } elsif ($methylation_calls[$index] eq 'x') {
2115 $counting{total_unmethylated_CHG_count}++;
2116 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2117 } elsif ($methylation_calls[$index] eq 'Z') {
2118 $counting{total_meCpG_count}++;
2119 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2120 } elsif ($methylation_calls[$index] eq 'z') {
2121 $counting{total_unmethylated_CpG_count}++;
2122 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2123 } elsif ($methylation_calls[$index] eq 'H') {
2124 $counting{total_meCHH_count}++;
2125 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2126 } elsif ($methylation_calls[$index] eq 'h') {
2127 $counting{total_unmethylated_CHH_count}++;
2128 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2129 }
2130 elsif ($methylation_calls[$index] eq '.') {}
2131 else{
2132 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2133 }
2134 }
2135 } elsif ($strand eq '-') {
2136 for my $index (0..$#methylation_calls) {
2137
2138 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2139 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2140 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2141 $cigar_offset += $cigar_mod;
2142 $pos_offset += $pos_mod;
2143 }
2144
2145 ### Returning as soon as the methylation calls start overlapping
2146 if ($start-$index+$pos_offset <= $end_read_1) {
2147 return;
2148 }
2149
2150 if ($methylation_calls[$index] eq 'X') {
2151 $counting{total_meCHG_count}++;
2152 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2153 } elsif ($methylation_calls[$index] eq 'x') {
2154 $counting{total_unmethylated_CHG_count}++;
2155 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2156 } elsif ($methylation_calls[$index] eq 'Z') {
2157 $counting{total_meCpG_count}++;
2158 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2159 } elsif ($methylation_calls[$index] eq 'z') {
2160 $counting{total_unmethylated_CpG_count}++;
2161 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2162 } elsif ($methylation_calls[$index] eq 'H') {
2163 $counting{total_meCHH_count}++;
2164 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2165 } elsif ($methylation_calls[$index] eq 'h') {
2166 $counting{total_unmethylated_CHH_count}++;
2167 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2168 }
2169 elsif ($methylation_calls[$index] eq '.') {}
2170 else{
2171 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2172 }
2173 }
2174 } else {
2175 die "The strand orientation was neither + nor -: '$strand'/n";
2176 }
2177 }
2178 }
2179
2180 ### this is the default paired-end procedure allowing overlaps and using every single C position
2181 ### Still within the 2-CONTEXT ONLY optional section
2182 else {
2183 ### single-file CpG and non-CpG context output
2184 if ($full) {
2185 if ($strand eq '+') {
2186 for my $index (0..$#methylation_calls) {
2187
2188 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2191 $cigar_offset += $cigar_mod;
2192 $pos_offset += $pos_mod;
2193 }
2194
2195 if ($methylation_calls[$index] eq 'X') {
2196 $counting{total_meCHG_count}++;
2197 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2198 } elsif ($methylation_calls[$index] eq 'x') {
2199 $counting{total_unmethylated_CHG_count}++;
2200 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2201 } elsif ($methylation_calls[$index] eq 'Z') {
2202 $counting{total_meCpG_count}++;
2203 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2204 } elsif ($methylation_calls[$index] eq 'z') {
2205 $counting{total_unmethylated_CpG_count}++;
2206 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2207 } elsif ($methylation_calls[$index] eq 'H') {
2208 $counting{total_meCHH_count}++;
2209 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2210 } elsif ($methylation_calls[$index] eq 'h') {
2211 $counting{total_unmethylated_CHH_count}++;
2212 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2213 }
2214 elsif ($methylation_calls[$index] eq '.') {}
2215 else{
2216 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2217 }
2218 }
2219 } elsif ($strand eq '-') {
2220 for my $index (0..$#methylation_calls) {
2221
2222 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2223 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2224 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2225 $cigar_offset += $cigar_mod;
2226 $pos_offset += $pos_mod;
2227 }
2228
2229 if ($methylation_calls[$index] eq 'X') {
2230 $counting{total_meCHG_count}++;
2231 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2232 } elsif ($methylation_calls[$index] eq 'x') {
2233 $counting{total_unmethylated_CHG_count}++;
2234 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2235 } elsif ($methylation_calls[$index] eq 'Z') {
2236 $counting{total_meCpG_count}++;
2237 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2238 } elsif ($methylation_calls[$index] eq 'z') {
2239 $counting{total_unmethylated_CpG_count}++;
2240 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2241 } elsif ($methylation_calls[$index] eq 'H') {
2242 $counting{total_meCHH_count}++;
2243 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2244 } elsif ($methylation_calls[$index] eq 'h') {
2245 $counting{total_unmethylated_CHH_count}++;
2246 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2247 }
2248 elsif ($methylation_calls[$index] eq '.') {}
2249 else{
2250 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2251 }
2252 }
2253 } else {
2254 die "The strand orientation as neither + nor -: '$strand'\n";
2255 }
2256 }
2257
2258 ### strand-specific methylation output
2259 ### still within the 2-CONTEXT optional section
2260 else {
2261 if ($strand eq '+') {
2262 for my $index (0..$#methylation_calls) {
2263
2264 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2265 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2266 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2267 $cigar_offset += $cigar_mod;
2268 $pos_offset += $pos_mod;
2269 }
2270
2271 if ($methylation_calls[$index] eq 'X') {
2272 $counting{total_meCHG_count}++;
2273 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2274 } elsif ($methylation_calls[$index] eq 'x') {
2275 $counting{total_unmethylated_CHG_count}++;
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2277 } elsif ($methylation_calls[$index] eq 'Z') {
2278 $counting{total_meCpG_count}++;
2279 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2280 } elsif ($methylation_calls[$index] eq 'z') {
2281 $counting{total_unmethylated_CpG_count}++;
2282 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2283 } elsif ($methylation_calls[$index] eq 'H') {
2284 $counting{total_meCHH_count}++;
2285 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2286 } elsif ($methylation_calls[$index] eq 'h') {
2287 $counting{total_unmethylated_CHH_count}++;
2288 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2289 }
2290 elsif ($methylation_calls[$index] eq '.') {}
2291 else{
2292 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2293 }
2294 }
2295 } elsif ($strand eq '-') {
2296 for my $index (0..$#methylation_calls) {
2297
2298 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2299 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2300 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2301 $cigar_offset += $cigar_mod;
2302 $pos_offset += $pos_mod;
2303 }
2304
2305 if ($methylation_calls[$index] eq 'X') {
2306 $counting{total_meCHG_count}++;
2307 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2308 } elsif ($methylation_calls[$index] eq 'x') {
2309 $counting{total_unmethylated_CHG_count}++;
2310 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2311 } elsif ($methylation_calls[$index] eq 'Z') {
2312 $counting{total_meCpG_count}++;
2313 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2314 } elsif ($methylation_calls[$index] eq 'z') {
2315 $counting{total_unmethylated_CpG_count}++;
2316 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2317 } elsif ($methylation_calls[$index] eq 'H') {
2318 $counting{total_meCHH_count}++;
2319 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2320 } elsif ($methylation_calls[$index] eq 'h') {
2321 $counting{total_unmethylated_CHH_count}++;
2322 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2323 }
2324 elsif ($methylation_calls[$index] eq '.') {}
2325 else{
2326 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2327 }
2328 }
2329 } else {
2330 die "The strand orientation as neither + nor -: '$strand'\n";
2331 }
2332 }
2333 }
2334 }
2335
2336 ############################################
2337 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
2338 ############################################
2339
2340 elsif ($no_overlap) {
2341 ### single-file CpG, CHG and CHH context output
2342 if ($full) {
2343 if ($strand eq '+') {
2344 for my $index (0..$#methylation_calls) {
2345
2346 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2347 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2348 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2349 $cigar_offset += $cigar_mod;
2350 $pos_offset += $pos_mod;
2351 }
2352
2353 ### Returning as soon as the methylation calls start overlapping
2354 if ($start+$index+$pos_offset >= $end_read_1) {
2355 return;
2356 }
2357
2358 if ($methylation_calls[$index] eq 'X') {
2359 $counting{total_meCHG_count}++;
2360 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2361 } elsif ($methylation_calls[$index] eq 'x') {
2362 $counting{total_unmethylated_CHG_count}++;
2363 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2364 } elsif ($methylation_calls[$index] eq 'Z') {
2365 $counting{total_meCpG_count}++;
2366 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2367 } elsif ($methylation_calls[$index] eq 'z') {
2368 $counting{total_unmethylated_CpG_count}++;
2369 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2370 } elsif ($methylation_calls[$index] eq 'H') {
2371 $counting{total_meCHH_count}++;
2372 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2373 } elsif ($methylation_calls[$index] eq 'h') {
2374 $counting{total_unmethylated_CHH_count}++;
2375 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2376 }
2377 elsif ($methylation_calls[$index] eq '.') {}
2378 else{
2379 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2380 }
2381 }
2382 } elsif ($strand eq '-') {
2383 for my $index (0..$#methylation_calls) {
2384
2385 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2386 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2387 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2388 $cigar_offset += $cigar_mod;
2389 $pos_offset += $pos_mod;
2390 }
2391
2392 ### Returning as soon as the methylation calls start overlapping
2393 if ($start-$index+$pos_offset <= $end_read_1) {
2394 return;
2395 }
2396
2397 if ($methylation_calls[$index] eq 'X') {
2398 $counting{total_meCHG_count}++;
2399 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2400 } elsif ($methylation_calls[$index] eq 'x') {
2401 $counting{total_unmethylated_CHG_count}++;
2402 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2403 } elsif ($methylation_calls[$index] eq 'Z') {
2404 $counting{total_meCpG_count}++;
2405 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2406 } elsif ($methylation_calls[$index] eq 'z') {
2407 $counting{total_unmethylated_CpG_count}++;
2408 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2409 } elsif ($methylation_calls[$index] eq 'H') {
2410 $counting{total_meCHH_count}++;
2411 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2412 } elsif ($methylation_calls[$index] eq 'h') {
2413 $counting{total_unmethylated_CHH_count}++;
2414 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2415 }
2416 elsif ($methylation_calls[$index] eq '.') {}
2417 else{
2418 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2419 }
2420 }
2421 } else {
2422 die "The strand orientation as neither + nor -: '$strand'\n";
2423 }
2424 }
2425
2426 ### strand-specific methylation output
2427 else {
2428 if ($strand eq '+') {
2429 for my $index (0..$#methylation_calls) {
2430
2431 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2432 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2433 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2434 $cigar_offset += $cigar_mod;
2435 $pos_offset += $pos_mod;
2436 }
2437
2438 ### Returning as soon as the methylation calls start overlapping
2439 if ($start+$index+$pos_offset >= $end_read_1) {
2440 return;
2441 }
2442
2443 if ($methylation_calls[$index] eq 'X') {
2444 $counting{total_meCHG_count}++;
2445 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2446 } elsif ($methylation_calls[$index] eq 'x') {
2447 $counting{total_unmethylated_CHG_count}++;
2448 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2449 } elsif ($methylation_calls[$index] eq 'Z') {
2450 $counting{total_meCpG_count}++;
2451 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2452 } elsif ($methylation_calls[$index] eq 'z') {
2453 $counting{total_unmethylated_CpG_count}++;
2454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2455 } elsif ($methylation_calls[$index] eq 'H') {
2456 $counting{total_meCHH_count}++;
2457 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2458 } elsif ($methylation_calls[$index] eq 'h') {
2459 $counting{total_unmethylated_CHH_count}++;
2460 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2461 }
2462 elsif ($methylation_calls[$index] eq '.') {}
2463 else{
2464 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2465 }
2466 }
2467 } elsif ($strand eq '-') {
2468 for my $index (0..$#methylation_calls) {
2469
2470 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2471 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2472 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2473 $cigar_offset += $cigar_mod;
2474 $pos_offset += $pos_mod;
2475 }
2476
2477 ### Returning as soon as the methylation calls start overlapping
2478 if ($start-$index+$pos_offset <= $end_read_1) {
2479 return;
2480 }
2481
2482 if ($methylation_calls[$index] eq 'X') {
2483 $counting{total_meCHG_count}++;
2484 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2485 } elsif ($methylation_calls[$index] eq 'x') {
2486 $counting{total_unmethylated_CHG_count}++;
2487 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2488 } elsif ($methylation_calls[$index] eq 'Z') {
2489 $counting{total_meCpG_count}++;
2490 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2491 } elsif ($methylation_calls[$index] eq 'z') {
2492 $counting{total_unmethylated_CpG_count}++;
2493 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2494 } elsif ($methylation_calls[$index] eq 'H') {
2495 $counting{total_meCHH_count}++;
2496 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2497 } elsif ($methylation_calls[$index] eq 'h') {
2498 $counting{total_unmethylated_CHH_count}++;
2499 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2500 }
2501 elsif ($methylation_calls[$index] eq '.') {}
2502 else{
2503 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2504 }
2505 }
2506 } else {
2507 die "The strand orientation as neither + nor -: '$strand'\n";
2508 }
2509 }
2510 }
2511
2512 ### this is the default paired-end procedure allowing overlaps and using every single C position
2513 else {
2514 ### single-file CpG, CHG and CHH context output
2515 if ($full) {
2516 if ($strand eq '+') {
2517 for my $index (0..$#methylation_calls) {
2518
2519 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2520 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2521 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2522 $cigar_offset += $cigar_mod;
2523 $pos_offset += $pos_mod;
2524 }
2525
2526 if ($methylation_calls[$index] eq 'X') {
2527 $counting{total_meCHG_count}++;
2528 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2529 } elsif ($methylation_calls[$index] eq 'x') {
2530 $counting{total_unmethylated_CHG_count}++;
2531 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2532 } elsif ($methylation_calls[$index] eq 'Z') {
2533 $counting{total_meCpG_count}++;
2534 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2535 } elsif ($methylation_calls[$index] eq 'z') {
2536 $counting{total_unmethylated_CpG_count}++;
2537 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2538 } elsif ($methylation_calls[$index] eq 'H') {
2539 $counting{total_meCHH_count}++;
2540 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2541 } elsif ($methylation_calls[$index] eq 'h') {
2542 $counting{total_unmethylated_CHH_count}++;
2543 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2544 }
2545 elsif ($methylation_calls[$index] eq '.') {}
2546 else{
2547 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2548 }
2549 }
2550 } elsif ($strand eq '-') {
2551 for my $index (0..$#methylation_calls) {
2552
2553 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2554 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2555 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2556 $cigar_offset += $cigar_mod;
2557 $pos_offset += $pos_mod;
2558 }
2559
2560 if ($methylation_calls[$index] eq 'X') {
2561 $counting{total_meCHG_count}++;
2562 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2563 } elsif ($methylation_calls[$index] eq 'x') {
2564 $counting{total_unmethylated_CHG_count}++;
2565 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2566 } elsif ($methylation_calls[$index] eq 'Z') {
2567 $counting{total_meCpG_count}++;
2568 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2569 } elsif ($methylation_calls[$index] eq 'z') {
2570 $counting{total_unmethylated_CpG_count}++;
2571 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2572 } elsif ($methylation_calls[$index] eq 'H') {
2573 $counting{total_meCHH_count}++;
2574 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2575 } elsif ($methylation_calls[$index] eq 'h') {
2576 $counting{total_unmethylated_CHH_count}++;
2577 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2578 }
2579 elsif ($methylation_calls[$index] eq '.') {}
2580 else{
2581 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2582 }
2583 }
2584 } else {
2585 die "The strand orientation as neither + nor -: '$strand'\n";
2586 }
2587 }
2588
2589 ### strand-specific methylation output
2590 else {
2591 if ($strand eq '+') {
2592 for my $index (0..$#methylation_calls) {
2593
2594 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2595 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2596 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2597 $cigar_offset += $cigar_mod;
2598 $pos_offset += $pos_mod;
2599 }
2600
2601 if ($methylation_calls[$index] eq 'X') {
2602 $counting{total_meCHG_count}++;
2603 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2604 } elsif ($methylation_calls[$index] eq 'x') {
2605 $counting{total_unmethylated_CHG_count}++;
2606 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2607 } elsif ($methylation_calls[$index] eq 'Z') {
2608 $counting{total_meCpG_count}++;
2609 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2610 } elsif ($methylation_calls[$index] eq 'z') {
2611 $counting{total_unmethylated_CpG_count}++;
2612 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2613 } elsif ($methylation_calls[$index] eq 'H') {
2614 $counting{total_meCHH_count}++;
2615 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2616 } elsif ($methylation_calls[$index] eq 'h') {
2617 $counting{total_unmethylated_CHH_count}++;
2618 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2619 }
2620 elsif ($methylation_calls[$index] eq '.') {}
2621 else{
2622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2623 }
2624 }
2625 } elsif ($strand eq '-') {
2626 for my $index (0..$#methylation_calls) {
2627
2628 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2629 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2630 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2631 $cigar_offset += $cigar_mod;
2632 $pos_offset += $pos_mod;
2633 }
2634
2635 if ($methylation_calls[$index] eq 'X') {
2636 $counting{total_meCHG_count}++;
2637 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2638 } elsif ($methylation_calls[$index] eq 'x') {
2639 $counting{total_unmethylated_CHG_count}++;
2640 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2641 } elsif ($methylation_calls[$index] eq 'Z') {
2642 $counting{total_meCpG_count}++;
2643 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2644 } elsif ($methylation_calls[$index] eq 'z') {
2645 $counting{total_unmethylated_CpG_count}++;
2646 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2647 } elsif ($methylation_calls[$index] eq 'H') {
2648 $counting{total_meCHH_count}++;
2649 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2650 } elsif ($methylation_calls[$index] eq 'h') {
2651 $counting{total_unmethylated_CHH_count}++;
2652 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2653 }
2654 elsif ($methylation_calls[$index] eq '.') {}
2655 else{
2656 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2657 }
2658 }
2659 } else {
2660 die "The strand orientation as neither + nor -: '$strand'\n";
2661 }
2662 }
2663 }
2664 }
2665
### Works out how far the CIGAR index and the genomic position have to be shifted for
### the methylation call at read position $index of a read containing InDels.
###
### Arguments:
###   $index        - 0-based position within the methylation call string
###   $cigar_offset - number of deleted positions already skipped in the expanded CIGAR
###   $pos_offset   - accumulated genomic position offset (accepted for interface
###                   compatibility; it is not needed for the calculation)
###   $strand       - '+' or '-'
###   $comp_cigar   - reference to the expanded CIGAR array holding one operation
###                   ('M', 'I' or 'D') per element
###
### Returns the list ($new_cigar_offset,$new_pos_offset). Both values are increments which
### the caller adds to its running $cigar_offset and $pos_offset.
###
### Dies if the CIGAR element looked at is not 'M', 'I' or 'D', or if $index + $cigar_offset
### points past the end of the expanded CIGAR array.
sub check_cigar_string {
    my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
    my ($new_cigar_offset,$new_pos_offset) = (0,0);

    ### Both strands are handled by the same logic, only the sign of the position
    ### adjustment differs:
    ###   + strand: insertion -1, deletion +1
    ###   - strand: insertion +1, deletion -1
    my $direction = $strand eq '+' ?  1
                  : $strand eq '-' ? -1
                  :                   0;

    ### any other strand value leaves both offsets untouched
    return ($new_cigar_offset,$new_pos_offset) unless ($direction);

    my $cigar_pos = $index + $cigar_offset;

    ### reading past the end of the array would only yield undef and a misleading error message
    if ($cigar_pos > $#{$comp_cigar}){
        die "The methylation call position (index $cigar_pos) lies beyond the end of the CIGAR string (".scalar @{$comp_cigar}." operations)\n";
    }

    my $op = $comp_cigar->[$cigar_pos];

    if ($op eq 'M'){ # sequence position matches the genomic position
        # position needs no adjustment
    }
    elsif ($op eq 'I'){ # insertion in the read sequence
        $new_pos_offset -= $direction; # inserted bases do not consume a genomic position
    }
    elsif ($op eq 'D'){ # deletion in the read sequence
        $new_cigar_offset += 1;          # the composite CIGAR string does no longer match the methylation call index
        $new_pos_offset   += $direction; # deleted bases do consume a genomic position

        ### looping through the CIGAR string until we hit another M or I (or run out of operations)
        while ( ($cigar_pos + $new_cigar_offset) < (scalar @{$comp_cigar}) ){
            my $next_op = $comp_cigar->[$cigar_pos + $new_cigar_offset];

            if ($next_op eq 'M'){ # sequence position matches the genomic position again
                last;
            }
            elsif ($next_op eq 'I'){ # the current base is an insertion directly following the deletion
                $new_pos_offset -= $direction;
                last;
            }
            elsif ($next_op eq 'D'){ # another deleted base
                $new_cigar_offset += 1;
                $new_pos_offset   += $direction;
            }
            else{
                die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$next_op'\n";
            }
        }
    }
    else{
        die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$op'\n";
    }

    return ($new_cigar_offset,$new_pos_offset);
}
2757
2758 sub print_individual_C_methylation_states_single_end{
2759
2760 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
2761 my @methylation_calls = split(//,$meth_call);
2762
2763 #################################################################
2764 ### . for bases not involving cytosines ###
2765 ### X for methylated C in CHG context (was protected) ###
2766 ### x for not methylated C in CHG context (was converted) ###
2767 ### H for methylated C in CHH context (was protected) ###
2768 ### h for not methylated C in CHH context (was converted) ###
2769 ### Z for methylated C in CpG context (was protected) ###
2770 ### z for not methylated C in CpG context (was converted) ###
2771 #################################################################
2772
2773 my $methyl_CHG_count = 0;
2774 my $methyl_CHH_count = 0;
2775 my $methyl_CpG_count = 0;
2776 my $unmethylated_CHG_count = 0;
2777 my $unmethylated_CHH_count = 0;
2778 my $unmethylated_CpG_count = 0;
2779
2780 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2781 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2782
2783 my @comp_cigar;
2784
2785 if ($cigar){ # parsing CIGAR string
2786
2787 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing
2788 if ($cigar =~ /^\d+M$/){
2789 # warn "See!? I told you so! $cigar\n";
2790 # sleep(1);
2791 }
2792 else{
2793
2794 my @len;
2795 my @ops;
2796
2797 @len = split (/\D+/,$cigar); # storing the length per operation
2798 @ops = split (/\d+/,$cigar); # storing the operation
2799 shift @ops; # remove the empty first element
2800 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2801
2802 foreach my $index (0..$#len){
2803 foreach (1..$len[$index]){
2804 # print "$ops[$index]";
2805 push @comp_cigar, $ops[$index];
2806 }
2807 }
2808 }
2809 # warn "\nDetected CIGAR string: $cigar\n";
2810 # warn "Length of methylation call: ",length $meth_call,"\n";
2811 # warn "number of operations: ",scalar @ops,"\n";
2812 # warn "number of length digits: ",scalar @len,"\n\n";
2813 # print @comp_cigar,"\n";
2814 # print "$meth_call\n\n";
2815 # sleep (1);
2816 }
2817
2818 ### adjusting the start position for all reads mapping to the reverse strand 4050 ### adjusting the start position for all reads mapping to the reverse strand
2819 if ($strand eq '-') { 4051 if ($strand eq '-') {
2820 4052
2821 if (@comp_cigar){ # only needed for SAM reads with InDels 4053 if (@comp_cigar){ # only needed for SAM reads with InDels
2822 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too 4054 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2823 # print @comp_cigar,"\n"; 4055 # print @comp_cigar,"\n";
2824 } 4056 }
2825 4057
2826 unless ($ignore){ ### if --ignore was specified the start position has already been corrected 4058 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
2827 4059
2828 if ($cigar){ ### SAM format 4060 if ($cigar){ ### SAM format
2829 if ($cigar =~ /^(\d+)M$/){ # linear match 4061 if ($cigar =~ /^(\d+)M$/){ # linear match
2830 $start += $1 - 1; 4062 $start += $1 - 1;
2831 } 4063 }
2832 else{ # InDel read 4064 else{ # InDel read
2859 4091
2860 ### methylated Cs (any context) will receive a forward (+) orientation 4092 ### methylated Cs (any context) will receive a forward (+) orientation
2861 ### not methylated Cs (any context) will receive a reverse (-) orientation 4093 ### not methylated Cs (any context) will receive a reverse (-) orientation
2862 if ($methylation_calls[$index] eq 'X') { 4094 if ($methylation_calls[$index] eq 'X') {
2863 $counting{total_meCHG_count}++; 4095 $counting{total_meCHG_count}++;
2864 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4096 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4097 $mbias_1{CHG}->{$index+1}->{meth}++;
2865 } 4098 }
2866 elsif ($methylation_calls[$index] eq 'x') { 4099 elsif ($methylation_calls[$index] eq 'x') {
2867 $counting{total_unmethylated_CHG_count}++; 4100 $counting{total_unmethylated_CHG_count}++;
2868 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4101 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4102 $mbias_1{CHG}->{$index+1}->{un}++;
2869 } 4103 }
2870 elsif ($methylation_calls[$index] eq 'Z') { 4104 elsif ($methylation_calls[$index] eq 'Z') {
2871 $counting{total_meCpG_count}++; 4105 $counting{total_meCpG_count}++;
2872 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4106 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4107 $mbias_1{CpG}->{$index+1}->{meth}++;
2873 } 4108 }
2874 elsif ($methylation_calls[$index] eq 'z') { 4109 elsif ($methylation_calls[$index] eq 'z') {
2875 $counting{total_unmethylated_CpG_count}++; 4110 $counting{total_unmethylated_CpG_count}++;
2876 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4111 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4112 $mbias_1{CpG}->{$index+1}->{un}++;
2877 } 4113 }
2878 elsif ($methylation_calls[$index] eq 'H') { 4114 elsif ($methylation_calls[$index] eq 'H') {
2879 $counting{total_meCHH_count}++; 4115 $counting{total_meCHH_count}++;
2880 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4116 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4117 $mbias_1{CHH}->{$index+1}->{meth}++;
2881 } 4118 }
2882 elsif ($methylation_calls[$index] eq 'h') { 4119 elsif ($methylation_calls[$index] eq 'h') {
2883 $counting{total_unmethylated_CHH_count}++; 4120 $counting{total_unmethylated_CHH_count}++;
2884 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4121 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2885 } 4122 $mbias_1{CHH}->{$index+1}->{un}++;
2886 elsif ($methylation_calls[$index] eq '.') { 4123 }
2887 } 4124 elsif ($methylation_calls[$index] eq '.') {}
4125 elsif (lc$methylation_calls[$index] eq 'u'){}
2888 else{ 4126 else{
2889 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4127 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2890 } 4128 }
2891 } 4129 }
2892 } 4130 }
2903 $pos_offset += $pos_mod; 4141 $pos_offset += $pos_mod;
2904 } 4142 }
2905 4143
2906 if ($methylation_calls[$index] eq 'X') { 4144 if ($methylation_calls[$index] eq 'X') {
2907 $counting{total_meCHG_count}++; 4145 $counting{total_meCHG_count}++;
2908 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4146 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4147 $mbias_1{CHG}->{$index+1}->{meth}++;
2909 } 4148 }
2910 elsif ($methylation_calls[$index] eq 'x') { 4149 elsif ($methylation_calls[$index] eq 'x') {
2911 $counting{total_unmethylated_CHG_count}++; 4150 $counting{total_unmethylated_CHG_count}++;
2912 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4151 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4152 $mbias_1{CHG}->{$index+1}->{un}++;
2913 } 4153 }
2914 elsif ($methylation_calls[$index] eq 'Z') { 4154 elsif ($methylation_calls[$index] eq 'Z') {
2915 $counting{total_meCpG_count}++; 4155 $counting{total_meCpG_count}++;
2916 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4156 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4157 $mbias_1{CpG}->{$index+1}->{meth}++;
2917 } 4158 }
2918 elsif ($methylation_calls[$index] eq 'z') { 4159 elsif ($methylation_calls[$index] eq 'z') {
2919 $counting{total_unmethylated_CpG_count}++; 4160 $counting{total_unmethylated_CpG_count}++;
2920 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4161 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4162 $mbias_1{CpG}->{$index+1}->{un}++;
2921 } 4163 }
2922 elsif ($methylation_calls[$index] eq 'H') { 4164 elsif ($methylation_calls[$index] eq 'H') {
2923 $counting{total_meCHH_count}++; 4165 $counting{total_meCHH_count}++;
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4166 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4167 $mbias_1{CHH}->{$index+1}->{meth}++;
2925 } 4168 }
2926 elsif ($methylation_calls[$index] eq 'h') { 4169 elsif ($methylation_calls[$index] eq 'h') {
2927 $counting{total_unmethylated_CHH_count}++; 4170 $counting{total_unmethylated_CHH_count}++;
2928 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4171 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2929 } 4172 $mbias_1{CHH}->{$index+1}->{un}++;
2930 elsif ($methylation_calls[$index] eq '.'){ 4173 }
2931 } 4174 elsif ($methylation_calls[$index] eq '.'){}
4175 elsif (lc$methylation_calls[$index] eq 'u'){}
2932 else{ 4176 else{
2933 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2934 } 4178 }
2935 } 4179 }
2936 } 4180 }
2952 $pos_offset += $pos_mod; 4196 $pos_offset += $pos_mod;
2953 } 4197 }
2954 4198
2955 if ($methylation_calls[$index] eq 'X') { 4199 if ($methylation_calls[$index] eq 'X') {
2956 $counting{total_meCHG_count}++; 4200 $counting{total_meCHG_count}++;
2957 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4201 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4202 $mbias_1{CHG}->{$index+1}->{meth}++;
2958 } 4203 }
2959 elsif ($methylation_calls[$index] eq 'x') { 4204 elsif ($methylation_calls[$index] eq 'x') {
2960 $counting{total_unmethylated_CHG_count}++; 4205 $counting{total_unmethylated_CHG_count}++;
2961 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4206 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4207 $mbias_1{CHG}->{$index+1}->{un}++;
2962 } 4208 }
2963 elsif ($methylation_calls[$index] eq 'Z') { 4209 elsif ($methylation_calls[$index] eq 'Z') {
2964 $counting{total_meCpG_count}++; 4210 $counting{total_meCpG_count}++;
2965 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4211 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4212 $mbias_1{CpG}->{$index+1}->{meth}++;
2966 } 4213 }
2967 elsif ($methylation_calls[$index] eq 'z') { 4214 elsif ($methylation_calls[$index] eq 'z') {
2968 $counting{total_unmethylated_CpG_count}++; 4215 $counting{total_unmethylated_CpG_count}++;
2969 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4217 $mbias_1{CpG}->{$index+1}->{un}++;
2970 } 4218 }
2971 elsif ($methylation_calls[$index] eq 'H') { 4219 elsif ($methylation_calls[$index] eq 'H') {
2972 $counting{total_meCHH_count}++; 4220 $counting{total_meCHH_count}++;
2973 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4221 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4222 $mbias_1{CHH}->{$index+1}->{meth}++;
2974 } 4223 }
2975 elsif ($methylation_calls[$index] eq 'h') { 4224 elsif ($methylation_calls[$index] eq 'h') {
2976 $counting{total_unmethylated_CHH_count}++; 4225 $counting{total_unmethylated_CHH_count}++;
2977 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4226 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2978 } 4227 $mbias_1{CHH}->{$index+1}->{un}++;
2979 elsif ($methylation_calls[$index] eq '.') { 4228 }
2980 } 4229 elsif ($methylation_calls[$index] eq '.') {}
4230 elsif (lc$methylation_calls[$index] eq 'u'){}
2981 else{ 4231 else{
2982 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4232 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2983 } 4233 }
2984 } 4234 }
2985 } 4235 }
2995 $pos_offset += $pos_mod; 4245 $pos_offset += $pos_mod;
2996 } 4246 }
2997 4247
2998 if ($methylation_calls[$index] eq 'X') { 4248 if ($methylation_calls[$index] eq 'X') {
2999 $counting{total_meCHG_count}++; 4249 $counting{total_meCHG_count}++;
3000 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4250 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4251 $mbias_1{CHG}->{$index+1}->{meth}++;
3001 } 4252 }
3002 elsif ($methylation_calls[$index] eq 'x') { 4253 elsif ($methylation_calls[$index] eq 'x') {
3003 $counting{total_unmethylated_CHG_count}++; 4254 $counting{total_unmethylated_CHG_count}++;
3004 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4255 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4256 $mbias_1{CHG}->{$index+1}->{un}++;
3005 } 4257 }
3006 elsif ($methylation_calls[$index] eq 'Z') { 4258 elsif ($methylation_calls[$index] eq 'Z') {
3007 $counting{total_meCpG_count}++; 4259 $counting{total_meCpG_count}++;
3008 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4260 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4261 $mbias_1{CpG}->{$index+1}->{meth}++;
3009 } 4262 }
3010 elsif ($methylation_calls[$index] eq 'z') { 4263 elsif ($methylation_calls[$index] eq 'z') {
3011 $counting{total_unmethylated_CpG_count}++; 4264 $counting{total_unmethylated_CpG_count}++;
3012 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4265 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4266 $mbias_1{CpG}->{$index+1}->{un}++;
3013 } 4267 }
3014 elsif ($methylation_calls[$index] eq 'H') { 4268 elsif ($methylation_calls[$index] eq 'H') {
3015 $counting{total_meCHH_count}++; 4269 $counting{total_meCHH_count}++;
3016 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4270 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4271 $mbias_1{CHH}->{$index+1}->{meth}++;
3017 } 4272 }
3018 elsif ($methylation_calls[$index] eq 'h') { 4273 elsif ($methylation_calls[$index] eq 'h') {
3019 $counting{total_unmethylated_CHH_count}++; 4274 $counting{total_unmethylated_CHH_count}++;
3020 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4275 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3021 } 4276 $mbias_1{CHH}->{$index+1}->{un}++;
3022 elsif ($methylation_calls[$index] eq '.') { 4277 }
3023 } 4278 elsif ($methylation_calls[$index] eq '.') {}
4279 elsif (lc$methylation_calls[$index] eq 'u'){}
3024 else{ 4280 else{
3025 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3026 } 4282 }
3027 } 4283 }
3028 } 4284 }
3042 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels 4298 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
3043 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); 4299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3044 $cigar_offset += $cigar_mod; 4300 $cigar_offset += $cigar_mod;
3045 $pos_offset += $pos_mod; 4301 $pos_offset += $pos_mod;
3046 } 4302 }
3047 4303
3048 if ($methylation_calls[$index] eq 'X') { 4304 if ($methylation_calls[$index] eq 'X') {
3049 $counting{total_meCHG_count}++; 4305 $counting{total_meCHG_count}++;
3050 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3051 } elsif ($methylation_calls[$index] eq 'x') { 4307 $mbias_1{CHG}->{$index+1}->{meth}++;
4308 }
4309 elsif ($methylation_calls[$index] eq 'x') {
3052 $counting{total_unmethylated_CHG_count}++; 4310 $counting{total_unmethylated_CHG_count}++;
3053 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4311 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3054 } elsif ($methylation_calls[$index] eq 'Z') { 4312 $mbias_1{CHG}->{$index+1}->{un}++;
4313 }
4314 elsif ($methylation_calls[$index] eq 'Z') {
3055 $counting{total_meCpG_count}++; 4315 $counting{total_meCpG_count}++;
3056 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4316 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3057 } elsif ($methylation_calls[$index] eq 'z') { 4317 $mbias_1{CpG}->{$index+1}->{meth}++;
4318 }
4319 elsif ($methylation_calls[$index] eq 'z') {
3058 $counting{total_unmethylated_CpG_count}++; 4320 $counting{total_unmethylated_CpG_count}++;
3059 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4321 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3060 } elsif ($methylation_calls[$index] eq 'H') { 4322 $mbias_1{CpG}->{$index+1}->{un}++;
4323 }
4324 elsif ($methylation_calls[$index] eq 'H') {
3061 $counting{total_meCHH_count}++; 4325 $counting{total_meCHH_count}++;
3062 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4326 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3063 } elsif ($methylation_calls[$index] eq 'h') { 4327 $mbias_1{CHH}->{$index+1}->{meth}++;
4328 }
4329 elsif ($methylation_calls[$index] eq 'h') {
3064 $counting{total_unmethylated_CHH_count}++; 4330 $counting{total_unmethylated_CHH_count}++;
3065 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4331 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4332 $mbias_1{CHH}->{$index+1}->{un}++;
3066 } 4333 }
3067 elsif ($methylation_calls[$index] eq '.') {} 4334 elsif ($methylation_calls[$index] eq '.') {}
4335 elsif (lc$methylation_calls[$index] eq 'u'){}
3068 else{ 4336 else{
3069 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4337 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3070 } 4338 }
3071 } 4339 }
3072 } 4340 }
3073 elsif ($strand eq '-') { 4341 elsif ($strand eq '-') {
3074 4342
3082 $pos_offset += $pos_mod; 4350 $pos_offset += $pos_mod;
3083 } 4351 }
3084 4352
3085 if ($methylation_calls[$index] eq 'X') { 4353 if ($methylation_calls[$index] eq 'X') {
3086 $counting{total_meCHG_count}++; 4354 $counting{total_meCHG_count}++;
3087 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4355 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3088 } elsif ($methylation_calls[$index] eq 'x') { 4356 $mbias_1{CHG}->{$index+1}->{meth}++;
4357 }
4358 elsif ($methylation_calls[$index] eq 'x') {
3089 $counting{total_unmethylated_CHG_count}++; 4359 $counting{total_unmethylated_CHG_count}++;
3090 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4360 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3091 } elsif ($methylation_calls[$index] eq 'Z') { 4361 $mbias_1{CHG}->{$index+1}->{un}++;
4362 }
4363 elsif ($methylation_calls[$index] eq 'Z') {
3092 $counting{total_meCpG_count}++; 4364 $counting{total_meCpG_count}++;
3093 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4365 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3094 } elsif ($methylation_calls[$index] eq 'z') { 4366 $mbias_1{CpG}->{$index+1}->{meth}++;
4367 }
4368 elsif ($methylation_calls[$index] eq 'z') {
3095 $counting{total_unmethylated_CpG_count}++; 4369 $counting{total_unmethylated_CpG_count}++;
3096 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4370 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3097 } elsif ($methylation_calls[$index] eq 'H') { 4371 $mbias_1{CpG}->{$index+1}->{un}++;
4372 }
4373 elsif ($methylation_calls[$index] eq 'H') {
3098 $counting{total_meCHH_count}++; 4374 $counting{total_meCHH_count}++;
3099 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4375 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3100 } elsif ($methylation_calls[$index] eq 'h') { 4376 $mbias_1{CHH}->{$index+1}->{meth}++;
4377 }
4378 elsif ($methylation_calls[$index] eq 'h') {
3101 $counting{total_unmethylated_CHH_count}++; 4379 $counting{total_unmethylated_CHH_count}++;
3102 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4380 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4381 $mbias_1{CHH}->{$index+1}->{un}++;
3103 } 4382 }
3104 elsif ($methylation_calls[$index] eq '.') {} 4383 elsif ($methylation_calls[$index] eq '.') {}
4384 elsif (lc$methylation_calls[$index] eq 'u'){}
3105 else{ 4385 else{
3106 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4386 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3107 } 4387 }
3108 } 4388 }
3109 } 4389 }
3125 $pos_offset += $pos_mod; 4405 $pos_offset += $pos_mod;
3126 } 4406 }
3127 4407
3128 if ($methylation_calls[$index] eq 'X') { 4408 if ($methylation_calls[$index] eq 'X') {
3129 $counting{total_meCHG_count}++; 4409 $counting{total_meCHG_count}++;
3130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4410 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3131 } elsif ($methylation_calls[$index] eq 'x') { 4411 $mbias_1{CHG}->{$index+1}->{meth}++;
4412 }
4413 elsif ($methylation_calls[$index] eq 'x') {
3132 $counting{total_unmethylated_CHG_count}++; 4414 $counting{total_unmethylated_CHG_count}++;
3133 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4415 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3134 } elsif ($methylation_calls[$index] eq 'Z') { 4416 $mbias_1{CHG}->{$index+1}->{un}++;
4417 }
4418 elsif ($methylation_calls[$index] eq 'Z') {
3135 $counting{total_meCpG_count}++; 4419 $counting{total_meCpG_count}++;
3136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4420 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3137 } elsif ($methylation_calls[$index] eq 'z') { 4421 $mbias_1{CpG}->{$index+1}->{meth}++;
4422 }
4423 elsif ($methylation_calls[$index] eq 'z') {
3138 $counting{total_unmethylated_CpG_count}++; 4424 $counting{total_unmethylated_CpG_count}++;
3139 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4425 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3140 } elsif ($methylation_calls[$index] eq 'H') { 4426 $mbias_1{CpG}->{$index+1}->{un}++;
4427 }
4428 elsif ($methylation_calls[$index] eq 'H') {
3141 $counting{total_meCHH_count}++; 4429 $counting{total_meCHH_count}++;
3142 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4430 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3143 } elsif ($methylation_calls[$index] eq 'h') { 4431 $mbias_1{CHH}->{$index+1}->{meth}++;
4432 }
4433 elsif ($methylation_calls[$index] eq 'h') {
3144 $counting{total_unmethylated_CHH_count}++; 4434 $counting{total_unmethylated_CHH_count}++;
3145 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n"; 4435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4436 $mbias_1{CHH}->{$index+1}->{un}++;
3146 } 4437 }
3147 elsif ($methylation_calls[$index] eq '.') {} 4438 elsif ($methylation_calls[$index] eq '.') {}
4439 elsif (lc$methylation_calls[$index] eq 'u'){}
3148 else{ 4440 else{
3149 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4441 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3150 } 4442 }
3151 } 4443 }
3152 } 4444 }
3162 $pos_offset += $pos_mod; 4454 $pos_offset += $pos_mod;
3163 } 4455 }
3164 4456
3165 if ($methylation_calls[$index] eq 'X') { 4457 if ($methylation_calls[$index] eq 'X') {
3166 $counting{total_meCHG_count}++; 4458 $counting{total_meCHG_count}++;
3167 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4459 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3168 } elsif ($methylation_calls[$index] eq 'x') { 4460 $mbias_1{CHG}->{$index+1}->{meth}++;
4461 }
4462 elsif ($methylation_calls[$index] eq 'x') {
3169 $counting{total_unmethylated_CHG_count}++; 4463 $counting{total_unmethylated_CHG_count}++;
3170 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4464 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3171 } elsif ($methylation_calls[$index] eq 'Z') { 4465 $mbias_1{CHG}->{$index+1}->{un}++;
4466 }
4467 elsif ($methylation_calls[$index] eq 'Z') {
3172 $counting{total_meCpG_count}++; 4468 $counting{total_meCpG_count}++;
3173 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4469 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3174 } elsif ($methylation_calls[$index] eq 'z') { 4470 $mbias_1{CpG}->{$index+1}->{meth}++;
4471 }
4472 elsif ($methylation_calls[$index] eq 'z') {
3175 $counting{total_unmethylated_CpG_count}++; 4473 $counting{total_unmethylated_CpG_count}++;
3176 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3177 } elsif ($methylation_calls[$index] eq 'H') { 4475 $mbias_1{CpG}->{$index+1}->{un}++;
4476 }
4477 elsif ($methylation_calls[$index] eq 'H') {
3178 $counting{total_meCHH_count}++; 4478 $counting{total_meCHH_count}++;
3179 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4479 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3180 } elsif ($methylation_calls[$index] eq 'h') { 4480 $mbias_1{CHH}->{$index+1}->{meth}++;
4481 }
4482 elsif ($methylation_calls[$index] eq 'h') {
3181 $counting{total_unmethylated_CHH_count}++; 4483 $counting{total_unmethylated_CHH_count}++;
3182 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n"; 4484 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4485 $mbias_1{CHH}->{$index+1}->{un}++;
3183 } 4486 }
3184 elsif ($methylation_calls[$index] eq '.') {} 4487 elsif ($methylation_calls[$index] eq '.') {}
4488 elsif (lc$methylation_calls[$index] eq 'u'){}
3185 else{ 4489 else{
3186 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; 4490 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3187 } 4491 }
3188 } 4492 }
3189 } 4493 }
3190 else { 4494 else {
3191 die "The strand information was neither + nor -: $strand\n"; 4495 die "The strand information was neither + nor -: $strand\n";
3192 } 4496 }
3193 } 4497 }
3194 } 4498 }
3195
3196
3197
3198 #######################################################################################################################################
3199 ### bismark2bedGraph section - START
3200 #######################################################################################################################################
3201
3202 ### has now been moved to the external script bismark2bedGraph
3203
3204 # sub process_bedGraph_output{
3205 # warn "="x64,"\n";
3206 # warn "Methylation information will now be written into a bedGraph file\n";
3207 # warn "="x64,"\n\n";
3208 # sleep (2);
3209
3210 # ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
3211 # foreach my $fh (keys %fhs) {
3212 # if ($fh =~ /^[1230]$/) {
3213 # foreach my $context (keys %{$fhs{$fh}}) {
3214 # close $fhs{$fh}->{$context} or die $!;
3215 # }
3216 # } else {
3217 # close $fhs{$fh} or die $!;
3218 # }
3219 # }
3220
3221 # ### deciding which files to use for bedGraph conversion
3222 # foreach my $filename (@sorting_files){
3223 # # warn "$filename\n";
3224 # if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
3225 # $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
3226 # # warn "$filename\n";
3227 # }
3228
3229 # if ($CX_context){
3230 # push @bedfiles,$filename;
3231 # }
3232 # else{ ## CpG context only (default)
3233 # if ($filename =~ /^CpG_/){
3234 # push @bedfiles,$filename;
3235 # }
3236 # else{
3237 # # skipping CHH or CHG files
3238 # }
3239 # }
3240 # }
3241
3242 # warn "Using the following files as Input:\n";
3243 # print join ("\t",@bedfiles),"\n\n";
3244 # sleep (2);
3245
3246 # my %temp_fhs;
3247 # my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
3248
3249 # ### changing to the output directory
3250 # unless ($output_dir eq ''){ # default
3251 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
3252 # warn "Changed directory to $output_dir\n";
3253 # }
3254
3255 # foreach my $infile (@bedfiles) {
3256
3257 # if ($remove) {
3258 # warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
3259
3260 # if ($infile =~ /gz$/){
3261 # open (READ,"zcat $infile |") or die $!;
3262 # }
3263 # else{
3264 # open (READ,$infile) or die $!;
3265 # }
3266
3267 # my $removed_spaces_outfile = $infile;
3268 # $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
3269
3270 # open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
3271
3272 # unless ($no_header){
3273 # $_ = <READ>; ### Bismark version header
3274 # print REM $_; ### Bismark version header
3275 # }
3276
3277 # while (<READ>) {
3278 # chomp;
3279 # my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
3280 # $id =~ s/\s+/_/g;
3281 # print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
3282 # }
3283
3284 # close READ or die $!;
3285 # close REM or die $!;
3286
3287 # ### changing the infile name to the new file without spaces
3288 # $infile = $removed_spaces_outfile;
3289 # }
3290
3291 # warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
3292 # if ($infile =~ /gz$/){
3293 # open (IN,"zcat $infile |") or die $!;
3294 # }
3295 # else{
3296 # open (IN,$infile) or die $!;
3297 # }
3298
3299 # ## always ignoring the version header
3300 # unless ($no_header){
3301 # $_ = <IN>; ### Bismark version header
3302 # }
3303
3304 # while (<IN>) {
3305 # chomp;
3306 # my ($chr) = (split (/\t/))[2];
3307 # # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
3308 # $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names
3309 # # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
3310
3311 # unless (exists $temp_fhs{$chr}) {
3312 # open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
3313 # }
3314 # print {$temp_fhs{$chr}} "$_\n";
3315 # }
3316
3317 # warn "Finished writing out individual chromosome files for $infile\n";
3318 # }
3319 # warn "\n";
3320
3321 # @temp_files = <*.meth_extractor.temp>;
3322
3323 # warn "Collecting temporary chromosome file information...\n";
3324 # sleep (1);
3325 # warn "processing the following input file(s):\n";
3326 # warn join ("\n",@temp_files),"\n\n";
3327 # sleep (1);
3328
3329 # foreach my $in (@temp_files) {
3330 # if ($sort_size){
3331 # warn "Sorting input file $in by positions (using -S of '$sort_size')\n";
3332 # }
3333 # else{
3334 # warn "Sorting input file $in by positions (using default memory settings)\n";
3335 # }
3336 # my $sort_dir = $output_dir;
3337 # if ($sort_dir eq ''){
3338 # $sort_dir = './';
3339 # }
3340 # open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!";
3341 # # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";
3342
3343 # ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {
3344
3345 # my $name;
3346 # my $meth_state;
3347 # my $chr = "";
3348 # my $pos = 0;
3349 # my $meth_state2;
3350
3351 # my $last_pos;
3352 # my $last_chr;
3353
3354 # ############################################# }
3355
3356 # while (my $line = <$ifh>) {
3357 # next if $line =~ /^Bismark/;
3358 # chomp $line;
3359
3360 # ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {
3361
3362 # $last_chr = $chr;
3363 # $last_pos = $pos;
3364 # ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
3365
3366 # if (($last_pos ne $pos) || ($last_chr ne $chr)) {
3367 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
3368 # @methylcalls = qw (0 0 0);
3369 # }
3370
3371 # ############################################# }
3372
3373 # my $validated = validate_methylation_call($meth_state, $meth_state2);
3374 # unless($validated){
3375 # warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
3376 # next;
3377 # }
3378 # if ($meth_state eq "+") {
3379 # $methylcalls[0]++;
3380 # $methylcalls[2]++;
3381 # } else {
3382 # $methylcalls[1]++;
3383 # $methylcalls[2]++;
3384 # }
3385 # }
3386
3387 # ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {
3388
3389 # $last_chr = $chr;
3390 # $last_pos = $pos;
3391 # if ($methylcalls[2] > 0) {
3392 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
3393 # }
3394 # ############################################# }
3395
3396 # close $ifh or die $!;
3397
3398 # @methylcalls = qw (0 0 0); # resetting @methylcalls
3399
3400 # ### deleting temporary files
3401 # my $delete = unlink $in;
3402 # if ($delete) {
3403 # warn "Successfully deleted the temporary input file $in\n\n";
3404 # }
3405 # else {
3406 # warn "The temporary inputfile $in could not be deleted $!\n\n";
3407 # }
3408 # }
3409 # }
3410
3411 # sub generate_output{
3412 # my $methcount = $methylcalls[0];
3413 # my $nonmethcount = $methylcalls[1];
3414 # my $totalcount = $methylcalls[2];
3415 # my $last_chr = shift;
3416 # my $last_pos = shift;
3417 # croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
3418 # croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
3419
3420 # ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation {
3421
3422 # my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
3423 # my $meth_percentage;
3424 # ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
3425 # # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
3426 # if (defined $meth_percentage){
3427 # if ($counts){
3428 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
3429 # }
3430 # else{
3431 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n";
3432 # }
3433 # }
3434 # ############################################# }
3435 # }
3436
3437 # sub validate_methylation_call{
3438 # my $meth_state = shift;
3439 # croak "Missing (+/-) methylation call" unless defined $meth_state;
3440 # my $meth_state2 = shift;
3441 # croak "Missing alphabetical methylation call" unless defined $meth_state2;
3442 # my $is_consistent;
3443 # ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
3444 # : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
3445 # return 1 if $is_consistent;
3446 # return 0;
3447 # }
3448
3449 # sub check_CpG_methylation_call{
3450 # my $meth1 = shift;
3451 # my $meth2 = shift;
3452 # return 1 if($meth1 eq "+" && $meth2 eq "Z");
3453 # return 1 if($meth1 eq "-" && $meth2 eq "z");
3454 # return 0;
3455 # }
3456
3457 # sub check_nonCpG_methylation_call{
3458 # my $meth1 = shift;
3459 # my $meth2 = shift;
3460 # return 1 if($meth1 eq "+" && $meth2 eq "C");
3461 # return 1 if($meth1 eq "+" && $meth2 eq "X");
3462 # return 1 if($meth1 eq "+" && $meth2 eq "H");
3463 # return 1 if($meth1 eq "-" && $meth2 eq "c");
3464 # return 1 if($meth1 eq "-" && $meth2 eq "x");
3465 # return 1 if($meth1 eq "-" && $meth2 eq "h");
3466 # return 0;
3467 # }
3468
3469 #######################################################################################################################################
3470 ### bismark2bedGraph section - END
3471 #######################################################################################################################################
3472
3473
3474
3475
3476
3477
3478 # #######################################################################################################################################
3479 # ### genome-wide cytosine methylation report - START
3480 # #######################################################################################################################################
3481
3482 ### has now been moved to the external script bedGraph2cytosine
3483
3484 # sub generate_genome_wide_cytosine_report {
3485
3486 # warn "="x78,"\n";
3487 # warn "Methylation information will now be written into a genome-wide cytosine report\n";
3488 # warn "="x78,"\n\n";
3489 # sleep (2);
3490
3491 # ### changing to the output directory again
3492 # unless ($output_dir eq ''){ # default
3493 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
3494 # # warn "Changed directory to $output_dir\n";
3495 # }
3496
3497 # my $in = shift;
3498 # open (IN,$in) or die $!;
3499
3500 # my $cytosine_out = shift;
3501
3502 # if ($CX_context){
3503 # $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
3504 # }
3505 # else{
3506 # $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
3507 # }
3508
3509 # ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
3510 # unless ($split_by_chromosome){ ### writing all output to a single file (default)
3511 # open (CYT,'>',$cytosine_out) or die $!;
3512 # warn "Writing genome-wide cytosine report to: $cytosine_out\n";
3513 # sleep (3);
3514 # }
3515
3516 # my $last_chr;
3517 # my %chr; # storing reads for one chromosome at a time
3518
3519 # my $count = 0;
3520 # while (<IN>){
3521 # chomp;
3522 # ++$count;
3523 # my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
3524
3525 # # defining the first chromosome
3526 # unless (defined $last_chr){
3527 # $last_chr = $chr;
3528 # # warn "Storing all covered cytosine positions for chromosome: $chr\n";
3529 # }
3530
3531 # if ($chr eq $last_chr){
3532 # $chr{$chr}->{$start}->{meth} = $meth;
3533 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3534 # }
3535 # else{
3536 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3537
3538 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3539 # my $chromosome_out = $cytosine_out;
3540 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3541 # open (CYT,'>',$chromosome_out) or die $!;
3542 # }
3543
3544 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
3545
3546 # my $tri_nt = '';
3547 # my $context = '';
3548 # my $pos = pos$chromosomes{$last_chr};
3549
3550 # my $strand;
3551 # my $meth = 0;
3552 # my $nonmeth = 0;
3553
3554 # if ($1 eq 'C'){ # C on forward strand
3555 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3556 # $strand = '+';
3557 # }
3558 # elsif ($1 eq 'G'){ # C on reverse strand
3559 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3560 # $tri_nt = reverse $tri_nt;
3561 # $tri_nt =~ tr/ACTG/TGAC/;
3562 # $strand = '-';
3563 # }
3564 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3565
3566 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3567 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
3568 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3569 # }
3570
3571 # ### determining cytosine context
3572 # if ($tri_nt =~ /^CG/){
3573 # $context = 'CG';
3574 # }
3575 # elsif ($tri_nt =~ /^C.{1}G$/){
3576 # $context = 'CHG';
3577 # }
3578 # elsif ($tri_nt =~ /^C.{2}$/){
3579 # $context = 'CHH';
3580 # }
3581 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3582 # warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
3583 # next;
3584 # }
3585
3586 # if ($CpG_only){
3587 # if ($tri_nt =~ /^CG/){ # CpG context is the default
3588 # if ($zero){ # zero based coordinates
3589 # $pos -= 1;
3590 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3591 # }
3592 # else{ # default
3593 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3594 # }
3595 # }
3596 # }
3597 # else{ ## all cytosines, specified with --CX
3598 # if ($zero){ # zero based coordinates
3599 # $pos -= 1;
3600 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3601 # }
3602 # else{ # default
3603 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3604 # }
3605 # }
3606 # }
3607
3608 # %chr = (); # resetting the hash
3609
3610 # # new first entry
3611 # $last_chr = $chr;
3612 # $chr{$chr}->{$start}->{meth} = $meth;
3613 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3614 # }
3615 # }
3616
3617 # # Last found chromosome
3618 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3619
3620 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3621 # my $chromosome_out = $cytosine_out;
3622 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3623 # open (CYT,'>',$chromosome_out) or die $!;
3624 # }
3625
3626 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
3627
3628 # my $tri_nt;
3629 # my $context;
3630 # my $pos = pos$chromosomes{$last_chr};
3631
3632 # my $strand;
3633 # my $meth = 0;
3634 # my $nonmeth = 0;
3635
3636 # if ($1 eq 'C'){ # C on forward strand
3637 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3638 # $strand = '+';
3639 # }
3640 # elsif ($1 eq 'G'){ # C on reverse strand
3641 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3642 # $tri_nt = reverse $tri_nt;
3643 # $tri_nt =~ tr/ACTG/TGAC/;
3644 # $strand = '-';
3645 # }
3646
3647 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3648 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
3649 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3650 # }
3651
3652 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3653
3654 # ### determining cytosine context
3655 # if ($tri_nt =~ /^CG/){
3656 # $context = 'CG';
3657 # }
3658 # elsif ($tri_nt =~ /^C.{1}G$/){
3659 # $context = 'CHG';
3660 # }
3661 # elsif ($tri_nt =~ /^C.{2}$/){
3662 # $context = 'CHH';
3663 # }
3664 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3665 # warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
3666 # next;
3667 # }
3668
3669 # if ($CpG_only){
3670 # if ($tri_nt =~ /^CG/){ # CpG context is the default
3671 # if ($zero){ # zero-based coordinates
3672 # $pos -= 1;
3673 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3674 # }
3675 # else{ # default
3676 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3677 # }
3678 # }
3679 # }
3680 # else{ ## all cytosines, specified with --CX
3681 # if ($zero){ # zero based coordinates
3682 # $pos -= 1;
3683 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3684 # }
3685 # else{ # default
3686 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3687 # }
3688 # }
3689 # }
3690 # close CYT or die $!;
3691 # }
3692
3693
3694 # sub read_genome_into_memory{
3695
3696 # ## reading in and storing the specified genome in the %chromosomes hash
3697 # chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
3698 # warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
3699
3700 # my @chromosome_filenames = <*.fa>;
3701
3702 # ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
3703 # unless (@chromosome_filenames){
3704 # @chromosome_filenames = <*.fasta>;
3705 # }
3706 # unless (@chromosome_filenames){
3707 # die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
3708 # }
3709
3710 # foreach my $chromosome_filename (@chromosome_filenames){
3711
3712 # # skipping the tophat entire mouse genome fasta file
3713 # next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
3714
3715 # open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
3716 # ### first line needs to be a fastA header
3717 # my $first_line = <CHR_IN>;
3718 # chomp $first_line;
3719 # $first_line =~ s/\r//; # removing \r carriage returns
3720
3721 # ### Extracting chromosome name from the FastA header
3722 # my $chromosome_name = extract_chromosome_name($first_line);
3723
3724 # my $sequence;
3725 # while (<CHR_IN>){
3726 # chomp;
3727 # $_ =~ s/\r//; # removing \r carriage returns
3728
3729 # if ($_ =~ /^>/){
3730 # ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
3731 # if (exists $chromosomes{$chromosome_name}){
3732 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3733 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
3734 # }
3735 # else {
3736 # if (length($sequence) == 0){
3737 # warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
3738 # }
3739 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3740 # $chromosomes{$chromosome_name} = $sequence;
3741 # }
3742 # ### resetting the sequence variable
3743 # $sequence = '';
3744 # ### setting new chromosome name
3745 # $chromosome_name = extract_chromosome_name($_);
3746 # }
3747 # else{
3748 # $sequence .= uc$_;
3749 # }
3750 # }
3751
3752 # if (exists $chromosomes{$chromosome_name}){
3753 # warn "chr $chromosome_name (",length $sequence ," bp)\t";
3754 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
3755 # }
3756 # else{
3757 # if (length($sequence) == 0){
3758 # warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
3759 # }
3760 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3761 # $chromosomes{$chromosome_name} = $sequence;
3762 # }
3763 # }
3764 # warn "\n";
3765 # chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
3766 # }
3767
3768 # sub extract_chromosome_name {
3769 # ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well
3770 # my $fasta_header = shift;
3771 # if ($fasta_header =~ s/^>//){
3772 # my ($chromosome_name) = split (/\s+/,$fasta_header);
3773 # return $chromosome_name;
3774 # }
3775 # else{
3776 # die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
3777 # }
3778 # }
3779
3780 # #######################################################################################################################################
3781 # ### genome-wide cytosine methylation report - END
3782 # #######################################################################################################################################
3783
3784 4499
3785 4500
3786 4501
3787 sub print_helpfile{ 4502 sub print_helpfile{
3788 4503
3795 methylation extractor. The script reads in a bisulfite read alignment results file 4510 methylation extractor. The script reads in a bisulfite read alignment results file
3796 produced by the Bismark bisulfite mapper and extracts the methylation information 4511 produced by the Bismark bisulfite mapper and extracts the methylation information
3797 for individual cytosines. This information is found in the methylation call field 4512 for individual cytosines. This information is found in the methylation call field
3798 which can contain the following characters: 4513 which can contain the following characters:
3799 4514
3800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4515 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3801 ~~~ X for methylated C in CHG context (was protected) ~~~ 4516 ~~~ X for methylated C in CHG context ~~~
3802 ~~~ x for not methylated C CHG (was converted) ~~~ 4517 ~~~ x for not methylated C CHG ~~~
3803 ~~~ H for methylated C in CHH context (was protected) ~~~ 4518 ~~~ H for methylated C in CHH context ~~~
3804 ~~~ h for not methylated C in CHH context (was converted) ~~~ 4519 ~~~ h for not methylated C in CHH context ~~~
3805 ~~~ Z for methylated C in CpG context (was protected) ~~~ 4520 ~~~ Z for methylated C in CpG context ~~~
3806 ~~~ z for not methylated C in CpG context (was converted) ~~~ 4521 ~~~ z for not methylated C in CpG context ~~~
3807 ~~~ . for any bases not involving cytosines ~~~ 4522 ~~~ U for methylated C in Unknown context (CN or CHN) ~~~
3808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4523 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
4524 ~~~ . for any bases not involving cytosines ~~~
4525 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3809 4526
3810 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH 4527 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
3811 context (this distinction is actually already made in Bismark itself). As the methylation 4528 context (this distinction is actually already made in Bismark itself). As the methylation
3812 information for every C analysed can produce files which easily have tens or even hundreds of 4529 information for every C analysed can produce files which easily have tens or even hundreds of
3813 millions of lines, file sizes can become very large and more difficult to handle. The C 4530 millions of lines, file sizes can become very large and more difficult to handle. The C
3837 4554
3838 USAGE: methylation_extractor [options] <filenames> 4555 USAGE: methylation_extractor [options] <filenames>
3839 4556
3840 4557
3841 ARGUMENTS: 4558 ARGUMENTS:
4559 ==========
3842 4560
3843 <filenames> A space-separated list of Bismark result files in SAM format from 4561 <filenames> A space-separated list of Bismark result files in SAM format from
3844 which methylation information is extracted for every cytosine in 4562 which methylation information is extracted for every cytosine in
3845 the reads. For alignment files in the older custom Bismark output 4563 the reads. For alignment files in the older custom Bismark output
3846 see option '--vanilla'. 4564 see option '--vanilla'.
3866 Whilst this option removes a bias towards more methylation calls 4584 Whilst this option removes a bias towards more methylation calls
3867 in the center of sequenced fragments it may de facto remove a sizable 4585 in the center of sequenced fragments it may de facto remove a sizable
3868 proportion of the data. This option is highly recommended for paired-end 4586 proportion of the data. This option is highly recommended for paired-end
3869 data. 4587 data.
3870 4588
3871 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the 4589 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 when processing the
3872 methylation call string. This can remove e.g. a restriction enzyme site 4590 methylation call string. This can remove e.g. a restriction enzyme site
3873 at the start of each read. 4591 at the start of each read or any other source of bias (e.g. PBAT-Seq data).
3874 4592
3875 --comprehensive Specifying this option will merge all four possible strand-specific 4593 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
3876 methylation info into context-dependent output files. The default 4594 results only. Since the first couple of bases in Read 2 of BS-Seq experiments
4595 show a severe bias towards non-methylation as a result of end-repairing
4596 sonicated fragments with unmethylated cytosines (see M-bias plot), it is
4597 recommended that the first couple of bp of Read 2 are removed before
4598 starting downstream analysis. Please see the section on M-bias plots in the
4599 Bismark User Guide for more details.
4600
4601 --comprehensive Specifying this option will merge all four possible strand-specific
4602 methylation info into context-dependent output files. The default
4603
3877 contexts are: 4604 contexts are:
3878 - CpG context 4605 - CpG context
3879 - CHG context 4606 - CHG context
3880 - CHH context 4607 - CHH context
3881 4608
3902 4629
3903 --version Displays version information. 4630 --version Displays version information.
3904 4631
3905 -h/--help Displays this help file and exits. 4632 -h/--help Displays this help file and exits.
3906 4633
4634 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
4635 well as a report (optional) and then quit. Default: OFF.
4636
3907 4637
3908 4638
3909 bedGraph specific options: 4639 bedGraph specific options:
4640 ==========================
3910 4641
3911 --bedGraph After finishing the methylation extraction, the methylation output is written into a 4642 --bedGraph After finishing the methylation extraction, the methylation output is written into a
3912 sorted bedGraph file that reports the position of a given cytosine and its methylation 4643 sorted bedGraph file that reports the position of a given cytosine and its methylation
3913 state (in %, see details below). The methylation extractor output is temporarily split up into 4644 state (in %, see details below). The methylation extractor output is temporarily split up into
3914 temporary files, one per chromosome (written into the current directory or folder 4645 temporary files, one per chromosome (written into the current directory or folder
3925 before its methylation percentage is reported. Default: 1. 4656 before its methylation percentage is reported. Default: 1.
3926 4657
3927 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. 4658 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
3928 4659
3929 4660
3930 --counts Adds two additional columns to the output file to enable further calculations:
3931 col 5: number of methylated calls
3932 col 6: number of unmethylated calls
3933 This option is required if '--cytosine_report' is specified (and will be set automatically if
3934 necessary).
3935
3936 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered 4661 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
3937 in the experiment irrespective of its sequence context. This applies to both forward and 4662 in the experiment irrespective of its sequence context. This applies to both forward and
3938 reverse strands. Please be aware that this option may generate large temporary and output files 4663 reverse strands. Please be aware that this option may generate large temporary and output files
3939 and may take a long time to sort (up to many hours). Default: OFF. 4664 and may take a long time to sort (up to many hours). Default: OFF.
3940 (i.e. Default = CpG context only). 4665 (i.e. Default = CpG context only).
3943 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or 4668 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
3944 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. 4669 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
3945 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. 4670 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
3946 Defaults to 2G. 4671 Defaults to 2G.
3947 4672
4673 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
4674 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
4675 individual chromosome files. These errors were caused by the operating system's limit
4676 of the number of filehandles that can be written to at any one time (typically 1024; to
4677 find out this limit on Linux, type: ulimit -a).
4678 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
4679 methylation calls into individual chromosome files. Instead, all input files are
4680 temporarily merged into a single file (unless there is only a single file), and this
4681 file will then be sorted by both chromosome AND position using the Unix sort command.
4682 Please be aware that this option might take a looooong time to complete, depending on
4683 the size of the input files, and the memory you allocate to this process (see --buffer_size).
4684 Nevertheless, it seems to be working.
4685
4686 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
4687 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
4688 sorting process of very large files, but this comes at the cost of a larger memory footprint
4689 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
4690 of RAM). Due to overheads in creating and looping through these arrays it seems that it will
4691 actually be *slower* for small files (few million alignments), and we are currently testing at
4692 which point it is advisable to use this option. Note that --ample_memory is not compatible
4693 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
4694
4695
3948 4696
3949 Genome-wide cytosine methylation report specific options: 4697 Genome-wide cytosine methylation report specific options:
4698 =========================================================
3950 4699
3951 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a 4700 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
3952 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based 4701 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
3953 chromosome coordinates (zero-based coords are optional) and reports CpG context only (all 4702 chromosome coordinates (zero-based coords are optional) and reports CpG context only (all
3954 cytosine context is optional). The output considers all Cs on both forward and reverse strands and 4703 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
3981 * Methylated cytosines receive a '+' orientation, 4730 * Methylated cytosines receive a '+' orientation,
3982 * Unmethylated cytosines receive a '-' orientation. 4731 * Unmethylated cytosines receive a '-' orientation.
3983 4732
3984 4733
3985 4734
3986 The bedGraph output (optional) looks like this (tab-delimited): 4735 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
3987 =============================================================== 4736 =========================================================================================================
4737
4738 track type=bedGraph (header line)
4739
3988 <chromosome> <start position> <end position> <methylation percentage> 4740 <chromosome> <start position> <end position> <methylation percentage>
3989 4741
3990 The bedGraph output with '--counts' specified looks like this (tab-delimited): 4742
4743
4744 The coverage output looks like this (tab-delimited, 1-based genomic coords):
4745 ============================================================================
3991 4746
3992 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> 4747 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
3993 4748
3994 4749
3995 4750
3997 ========================================================================================== 4752 ==========================================================================================
3998 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> 4753 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
3999 4754
4000 4755
4001 4756
4002 This script was last modified on 21 April 2013. 4757 This script was last modified on 25 November 2013.
4003 4758
4004 HOW_TO 4759 HOW_TO
4005 } 4760 }