0
+ − 1 #!/usr/bin/perl
+ − 2 use warnings;
+ − 3 use strict;
+ − 4 $|++;
+ − 5 use Getopt::Long;
+ − 6 use Cwd;
+ − 7 use Carp;
+ − 8 use FindBin qw($Bin);
+ − 9 use lib "$Bin/../lib";
+ − 10
3
+ − 11
0
+ − 12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
+ − 13
+ − 14 ## This program is free software: you can redistribute it and/or modify
+ − 15 ## it under the terms of the GNU General Public License as published by
+ − 16 ## the Free Software Foundation, either version 3 of the License, or
+ − 17 ## (at your option) any later version.
+ − 18
+ − 19 ## This program is distributed in the hope that it will be useful,
+ − 20 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
+ − 21 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ − 22 ## GNU General Public License for more details.
+ − 23
+ − 24 ## You should have received a copy of the GNU General Public License
+ − 25 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
+ − 26
+ − 27 my @filenames; # input files
+ − 28 my %counting;
+ − 29 my $parent_dir = getcwd();
+ − 30
+ − 31 my %fhs;
+ − 32
3
+ − 33 my $version = 'v0.10.1';
+ − 34 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline();
0
+ − 35
+ − 36
+ − 37 ### only needed for bedGraph output
+ − 38 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
+ − 39 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
+ − 40 my @bedfiles;
+ − 41
+ − 42 ### only needed for genome-wide cytosine methylation report
+ − 43 my %chromosomes;
+ − 44
3
+ − 45 my %mbias_1;
+ − 46 my %mbias_2;
+ − 47
0
+ − 48 ##############################################################################################
+ − 49 ### Summarising Run Parameters
+ − 50 ##############################################################################################
+ − 51
+ − 52 ### METHYLATION EXTRACTOR
+ − 53
+ − 54 warn "Summarising Bismark methylation extractor parameters:\n";
+ − 55 warn '='x63,"\n";
+ − 56
+ − 57 if ($single){
+ − 58 if ($vanilla){
+ − 59 warn "Bismark single-end vanilla format specified\n";
+ − 60 }
+ − 61 else{
+ − 62 warn "Bismark single-end SAM format specified (default)\n"; # default
+ − 63 }
+ − 64 }
+ − 65 elsif ($paired){
+ − 66 if ($vanilla){
+ − 67 warn "Bismark paired-end vanilla format specified\n";
+ − 68 }
+ − 69 else{
+ − 70 warn "Bismark paired-end SAM format specified (default)\n"; # default
+ − 71 }
+ − 72 }
+ − 73
3
+ − 74 if ($single){
+ − 75 if ($ignore){
+ − 76 warn "First $ignore bp will be disregarded when processing the methylation call string\n";
+ − 77 }
0
+ − 78 }
3
+ − 79 else{ ## paired-end
+ − 80 if ($ignore){
+ − 81 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
+ − 82 }
+ − 83 if ($ignore_r2){
+ − 84 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
+ − 85 }
+ − 86 }
+ − 87
0
+ − 88
+ − 89 if ($full){
+ − 90 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
+ − 91 }
+ − 92 if ($merge_non_CpG){
+ − 93 warn "Merge CHG and CHH context to non-CpG context specified\n";
+ − 94 }
+ − 95 ### output directory
+ − 96 if ($output_dir eq ''){
+ − 97 warn "Output will be written to the current directory ('$parent_dir')\n";
+ − 98 }
+ − 99 else{
+ − 100 warn "Output path specified as: $output_dir\n";
+ − 101 }
+ − 102
+ − 103
+ − 104 sleep (1);
+ − 105
+ − 106 ### BEDGRAPH
+ − 107
+ − 108 if ($bedGraph){
+ − 109 warn "\n\nSummarising bedGraph parameters:\n";
+ − 110 warn '='x63,"\n";
+ − 111
+ − 112 if ($counts){
3
+ − 113 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
0
+ − 114 }
+ − 115 else{
+ − 116 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
+ − 117 }
+ − 118
+ − 119 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
+ − 120
+ − 121 if ($CX_context){
+ − 122 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
+ − 123 }
+ − 124 else{ # default
+ − 125 $CpG_only = 1;
+ − 126 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
+ − 127 }
+ − 128
+ − 129 if ($remove){
+ − 130 warn "White spaces in read ID names will be removed prior to sorting\n";
+ − 131 }
+ − 132
3
+ − 133 if ($ample_mem){
+ − 134 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
+ − 135 }
+ − 136 elsif (defined $sort_size){
0
+ − 137 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
+ − 138 }
+ − 139 else{
+ − 140 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
+ − 141 }
+ − 142
+ − 143
+ − 144
+ − 145 sleep (1);
+ − 146
+ − 147 if ($cytosine_report){
+ − 148 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
+ − 149 warn '='x63,"\n";
+ − 150 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
+ − 151
+ − 152
+ − 153 if ($CX_context){
+ − 154 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
+ − 155 }
+ − 156 else{ # default
+ − 157 $CpG_only = 1;
+ − 158 warn "Reporting cytosine methylation in CpG context only (default)\n";
+ − 159 }
+ − 160
+ − 161 if ($split_by_chromosome){
+ − 162 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
+ − 163 }
+ − 164
+ − 165 ### Zero-based coordinates
+ − 166 if ($zero){
+ − 167 warn "Using zero-based genomic coordinates (user-defined)\n";
+ − 168 }
+ − 169 else{ # default, 1-based coords
+ − 170 warn "Using 1-based genomic coordinates (default)\n";
+ − 171 }
+ − 172
+ − 173 ### GENOME folder
+ − 174 if ($genome_folder){
+ − 175 unless ($genome_folder =~/\/$/){
+ − 176 $genome_folder =~ s/$/\//;
+ − 177 }
+ − 178 warn "Genome folder was specified as $genome_folder\n";
+ − 179 }
+ − 180 else{
+ − 181 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
+ − 182 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
+ − 183 }
+ − 184 sleep (1);
+ − 185 }
+ − 186 }
+ − 187
+ − 188 warn "\n";
+ − 189 sleep (5);
+ − 190
+ − 191 ######################################################
+ − 192 ### PROCESSING FILES
+ − 193 ######################################################
+ − 194
+ − 195 foreach my $filename (@filenames){
+ − 196 # resetting counters and filehandles
+ − 197 %fhs = ();
+ − 198 %counting =(
+ − 199 total_meCHG_count => 0,
+ − 200 total_meCHH_count => 0,
+ − 201 total_meCpG_count => 0,
+ − 202 total_unmethylated_CHG_count => 0,
+ − 203 total_unmethylated_CHH_count => 0,
+ − 204 total_unmethylated_CpG_count => 0,
+ − 205 sequences_count => 0,
+ − 206 );
3
+ − 207
0
+ − 208 @sorting_files = ();
+ − 209 @bedfiles = ();
+ − 210
3
+ − 211 %mbias_1 = ();
+ − 212 %mbias_2 = ();
+ − 213
+ − 214 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
+ − 215 unless ($vanilla){
+ − 216 if ($paired){
+ − 217 test_positional_sorting($filename);
+ − 218 }
+ − 219 }
+ − 220
0
+ − 221 process_Bismark_results_file($filename);
+ − 222
+ − 223 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
+ − 224 foreach my $fh (keys %fhs) {
+ − 225 if ($fh =~ /^[1230]$/) {
+ − 226 foreach my $context (keys %{$fhs{$fh}}) {
+ − 227 close $fhs{$fh}->{$context} or die $!;
+ − 228 }
3
+ − 229 }
+ − 230 else{
0
+ − 231 close $fhs{$fh} or die $!;
+ − 232 }
+ − 233 }
+ − 234
3
+ − 235 ### printing out all M-Bias data
+ − 236 produce_mbias_plots ($filename);
+ − 237
+ − 238 delete_unused_files();
+ − 239
0
+ − 240 if ($bedGraph){
+ − 241
+ − 242 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ − 243 $out =~ s/gz$//;
+ − 244 $out =~ s/sam$//;
+ − 245 $out =~ s/bam$//;
+ − 246 $out =~ s/txt$//;
+ − 247 $out =~ s/$/bedGraph/;
+ − 248
+ − 249 my $bedGraph_output = $out;
+ − 250 my @args;
+ − 251
+ − 252 if ($remove){
+ − 253 push @args, '--remove';
+ − 254 }
+ − 255 if ($CX_context){
+ − 256 push @args, '--CX_context';
+ − 257 }
+ − 258 if ($no_header){
+ − 259 push @args, '--no_header';
+ − 260 }
3
+ − 261 if ($gazillion){
+ − 262 push @args, '--gazillion';
+ − 263 }
+ − 264 if ($ample_mem){
+ − 265 push @args, '--ample_memory';
0
+ − 266 }
+ − 267
3
+ − 268
+ − 269 # if ($counts){
+ − 270 # push @args, "--counts";
+ − 271 # }
+ − 272
0
+ − 273 push @args, "--buffer_size $sort_size";
+ − 274 push @args, "--cutoff $coverage_threshold";
+ − 275 push @args, "--output $bedGraph_output";
+ − 276 push @args, "--dir '$output_dir'";
+ − 277
+ − 278 ### adding all files to be sorted to @args
+ − 279 foreach my $f (@sorting_files){
+ − 280 push @args, $f;
+ − 281 }
+ − 282
+ − 283 # print join "\t",@args,"\n";
+ − 284
+ − 285 system ("$Bin/bismark2bedGraph @args");
+ − 286
+ − 287 warn "Finished BedGraph conversion ...\n\n";
+ − 288 sleep(3);
+ − 289
+ − 290 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
+ − 291 # warn "Writing bedGraph to file: $bedGraph_output\n";
+ − 292 # process_bedGraph_output();
+ − 293 # close OUT or die $!;
+ − 294
+ − 295 ### genome-wide cytosine methylation report requires bedGraph processing anyway
+ − 296 if ($cytosine_report){
3
+ − 297
0
+ − 298 @args = (); # resetting @args
+ − 299 my $cytosine_out = $out;
+ − 300 $cytosine_out =~ s/bedGraph$//;
+ − 301
+ − 302 if ($CX_context){
+ − 303 $cytosine_out =~ s/$/CX_report.txt/;
+ − 304 }
+ − 305 else{
+ − 306 $cytosine_out =~ s/$/CpG_report.txt/;
+ − 307 }
+ − 308
+ − 309 push @args, "--output $cytosine_out";
+ − 310 push @args, "--dir '$output_dir'";
+ − 311 push @args, "--genome '$genome_folder'";
+ − 312 push @args, "--parent_dir '$parent_dir'";
+ − 313
+ − 314 if ($zero){
+ − 315 push @args, "--zero";
+ − 316 }
+ − 317 if ($CX_context){
+ − 318 push @args, '--CX_context';
+ − 319 }
+ − 320 if ($split_by_chromosome){
+ − 321 push @args, '--split_by_chromosome';
+ − 322 }
+ − 323
3
+ − 324 my $coverage_output = $bedGraph_output;
+ − 325 $coverage_output =~ s/bedGraph$/bismark.cov/;
+ − 326
+ − 327 push @args, $output_dir . $coverage_output; # this will be the infile
+ − 328
+ − 329 system ("$Bin/coverage2cytosine @args");
0
+ − 330 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
+ − 331 warn "\n\nFinished generating genome-wide cytosine report\n\n";
+ − 332 }
+ − 333 }
+ − 334 }
+ − 335
3
+ − 336 sub delete_unused_files{
+ − 337
+ − 338 warn "Deleting unused files ...\n\n"; sleep(1);
+ − 339
+ − 340 my $index = 0;
+ − 341
+ − 342 while ($index <= $#sorting_files){
+ − 343 if ($sorting_files[$index] =~ /gz$/){
+ − 344 open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
+ − 345 }
+ − 346 else{
+ − 347 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
+ − 348 }
+ − 349
+ − 350 my $used = 0;
+ − 351
+ − 352 while (<USED>){
+ − 353 next if (/^Bismark/);
+ − 354 if ($_){
+ − 355 $used = 1;
+ − 356 last;
+ − 357 }
+ − 358 }
+ − 359
+ − 360 if ($used){
+ − 361 warn "$sorting_files[$index] contains data ->\tkept\n";
+ − 362 ++$index;
+ − 363 }
+ − 364 else{
+ − 365
+ − 366 my $delete = unlink $sorting_files[$index];
+ − 367
+ − 368 if ($delete){
+ − 369 warn "$sorting_files[$index] was empty ->\tdeleted\n";
+ − 370 }
+ − 371 else{
+ − 372 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n"
+ − 373 }
+ − 374
+ − 375 ### we also need to remove the element from @sorting_files
+ − 376 splice @sorting_files, $index, 1;
+ − 377 }
+ − 378 }
+ − 379 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately)
+ − 380 }
+ − 381
+ − 382 sub produce_mbias_plots{
+ − 383
+ − 384 my $filename = shift;
+ − 385
+ − 386 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+ − 387 $mbias =~ s/gz$//;
+ − 388 $mbias =~ s/sam$//;
+ − 389 $mbias =~ s/bam$//;
+ − 390 $mbias =~ s/txt$//;
+ − 391 my $mbias_graph_1 = my $mbias_graph_2 = $mbias;
+ − 392 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png';
+ − 393 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png';
+ − 394
+ − 395 $mbias =~ s/$/M-bias.txt/;
+ − 396
+ − 397 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n";
+ − 398
+ − 399 # determining maximum read length
+ − 400 my $max_length_1 = 0;
+ − 401 my $max_length_2 = 0;
+ − 402
+ − 403 foreach my $context (keys %mbias_1){
+ − 404 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
+ − 405 $max_length_1 = $pos unless ($max_length_1 >= $pos);
+ − 406 }
+ − 407 }
+ − 408 if ($paired){
+ − 409 foreach my $context (keys %mbias_2){
+ − 410 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
+ − 411 $max_length_2 = $pos unless ($max_length_2 >= $pos);
+ − 412 }
+ − 413 }
+ − 414 }
+ − 415
+ − 416 if ($single){
+ − 417 warn "Determining maximum read length for M-Bias plot\n";
+ − 418 warn "Maximum read length of Read 1: $max_length_1\n\n";
+ − 419 }
+ − 420 else{
+ − 421 warn "Determining maximum read lengths for M-Bias plots\n";
+ − 422 warn "Maximum read length of Read 1: $max_length_1\n";
+ − 423 warn "Maximum read length of Read 2: $max_length_2\n\n";
+ − 424 }
+ − 425 # sleep(3);
+ − 426
+ − 427 my @mbias_read1;
+ − 428 my @mbias_read2;
+ − 429
+ − 430 #Check whether the module GD::Graph:lines is installed
+ − 431 my $gd_graph_installed = 0;
+ − 432 eval{
+ − 433 require GD::Graph::lines;
+ − 434 GD::Graph::lines->import();
+ − 435 };
+ − 436
+ − 437 unless($@) { # syntax or routine error variable, set if something goes wron in the last eval{ require ...}
+ − 438 $gd_graph_installed = 1;
+ − 439
+ − 440 #Check whether the module GD::Graph::colour is installed
+ − 441 eval{
+ − 442 require GD::Graph::colour;
+ − 443 GD::Graph::colour->import(qw(:colours :lists :files :convert));
+ − 444 };
+ − 445
+ − 446 if ($@) {
+ − 447 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
+ − 448 sleep(2);
+ − 449 $gd_graph_installed = 0;
+ − 450 }
+ − 451
+ − 452
+ − 453 }
+ − 454 else{
+ − 455 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
+ − 456 sleep(2);
+ − 457 }
+ − 458
+ − 459
+ − 460 my $graph_title;
+ − 461 my $graph1;
+ − 462 my $graph2;
+ − 463
+ − 464 if ( $gd_graph_installed){
+ − 465 $graph1 = GD::Graph::lines->new(800,600);
+ − 466 if ($paired){
+ − 467 $graph2 = GD::Graph::lines->new(800,600);
+ − 468 }
+ − 469 }
+ − 470
+ − 471 foreach my $context (qw(CpG CHG CHH)){
+ − 472 @{$mbias_read1[0]} = ();
+ − 473
+ − 474 if ($paired){
+ − 475 print MBIAS "$context context (R1)\n================\n";
+ − 476 $graph_title = 'M-bias (Read 1)';
+ − 477 }
+ − 478 else{
+ − 479 print MBIAS "$context context\n===========\n";
+ − 480 $graph_title = 'M-bias';
+ − 481 }
+ − 482 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+ − 483
+ − 484 foreach my $pos (1..$max_length_1){
+ − 485
+ − 486 unless (defined $mbias_1{$context}->{$pos}->{meth}){
+ − 487 $mbias_1{$context}->{$pos}->{meth} = 0;
+ − 488 }
+ − 489 unless (defined $mbias_1{$context}->{$pos}->{un}){
+ − 490 $mbias_1{$context}->{$pos}->{un} = 0;
+ − 491 }
+ − 492
+ − 493 my $percent = '';
+ − 494 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
+ − 495 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
+ − 496 }
+ − 497 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
+ − 498
+ − 499 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+ − 500 push @{$mbias_read1[0]},$pos;
+ − 501
+ − 502 if ($context eq 'CpG'){
+ − 503 push @{$mbias_read1[1]},$percent;
+ − 504 push @{$mbias_read1[4]},$coverage;
+ − 505 }
+ − 506 elsif ($context eq 'CHG'){
+ − 507 push @{$mbias_read1[2]},$percent;
+ − 508 push @{$mbias_read1[5]},$coverage;
+ − 509 }
+ − 510 elsif ($context eq 'CHH'){
+ − 511 push @{$mbias_read1[3]},$percent;
+ − 512 push @{$mbias_read1[6]},$coverage;
+ − 513 }
+ − 514 }
+ − 515 print MBIAS "\n";
+ − 516 }
+ − 517
+ − 518 if ( $gd_graph_installed){
+ − 519
+ − 520 add_colour(nice_blue => [31,120,180]);
+ − 521 add_colour(nice_orange => [255,127,0]);
+ − 522 add_colour(nice_green => [51,160,44]);
+ − 523 add_colour(pale_blue => [153,206,227]);
+ − 524 add_colour(pale_orange => [253,204,138]);
+ − 525 add_colour(pale_green => [191,230,207]);
+ − 526
+ − 527 $graph1->set(
+ − 528 x_label => 'position (bp)',
+ − 529 y1_label => '% methylation',
+ − 530 y2_label => '# methylation calls',
+ − 531 title => $graph_title,
+ − 532 line_width => 2,
+ − 533 x_max_value => $max_length_1,
+ − 534 x_min_value => 0,
+ − 535 y_tick_number => 10,
+ − 536 y_label_skip => 2,
+ − 537 y1_max_value => 100,
+ − 538 y1_min_value => 0,
+ − 539 y_label_skip => 2,
+ − 540 y2_min_value => 0,
+ − 541 x_label_skip => 5,
+ − 542 x_label_position => 0.5,
+ − 543 x_tick_offset => -1,
+ − 544 bgclr => 'white',
+ − 545 transparent => 0,
+ − 546 two_axes => 1,
+ − 547 use_axis => [1,1,1,2,2,2],
+ − 548 legend_placement => 'RC',
+ − 549 legend_spacing => 6,
+ − 550 legend_marker_width => 24,
+ − 551 legend_marker_height => 18,
+ − 552 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
+ − 553 ) or die $graph1->error;
+ − 554
+ − 555 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
+ − 556
+ − 557 my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error;
+ − 558
+ − 559 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
+ − 560 binmode MBIAS_G1;
+ − 561 print MBIAS_G1 $gd1->png;
+ − 562 }
+ − 563
+ − 564 if ($paired){
+ − 565
+ − 566 foreach my $context (qw(CpG CHG CHH)){
+ − 567 @{$mbias_read2[0]} = ();
+ − 568
+ − 569 print MBIAS "$context context (R2)\n================\n";
+ − 570 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
+ − 571 foreach my $pos (1..$max_length_2){
+ − 572
+ − 573 unless (defined $mbias_2{$context}->{$pos}->{meth}){
+ − 574 $mbias_2{$context}->{$pos}->{meth} = 0;
+ − 575 }
+ − 576 unless (defined $mbias_2{$context}->{$pos}->{un}){
+ − 577 $mbias_2{$context}->{$pos}->{un} = 0;
+ − 578 }
+ − 579
+ − 580 my $percent = '';
+ − 581 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
+ − 582 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
+ − 583 }
+ − 584 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
+ − 585
+ − 586 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
+ − 587
+ − 588 push @{$mbias_read2[0]},$pos;
+ − 589
+ − 590 if ($context eq 'CpG'){
+ − 591 push @{$mbias_read2[1]},$percent;
+ − 592 push @{$mbias_read2[4]},$coverage;
+ − 593 }
+ − 594 elsif ($context eq 'CHG'){
+ − 595 push @{$mbias_read2[2]},$percent;
+ − 596 push @{$mbias_read2[5]},$coverage;
+ − 597 }
+ − 598 elsif ($context eq 'CHH'){
+ − 599 push @{$mbias_read2[3]},$percent;
+ − 600 push @{$mbias_read2[6]},$coverage;
+ − 601 }
+ − 602 }
+ − 603 print MBIAS "\n";
+ − 604 }
+ − 605
+ − 606 if ( $gd_graph_installed){
+ − 607
+ − 608 add_colour(nice_blue => [31,120,180]);
+ − 609 add_colour(nice_orange => [255,127,0]);
+ − 610 add_colour(nice_green => [51,160,44]);
+ − 611 add_colour(pale_blue => [153,206,227]);
+ − 612 add_colour(pale_orange => [253,204,138]);
+ − 613 add_colour(pale_green => [191,230,207]);
+ − 614
+ − 615 $graph2->set(
+ − 616 x_label => 'position (bp)',
+ − 617 line_width => 2,
+ − 618 x_max_value => $max_length_1,
+ − 619 x_min_value => 0,
+ − 620 y_tick_number => 10,
+ − 621 y_label_skip => 2,
+ − 622 y1_max_value => 100,
+ − 623 y1_min_value => 0,
+ − 624 y_label_skip => 2,
+ − 625 y2_min_value => 0,
+ − 626 x_label_skip => 5,
+ − 627 x_label_position => 0.5,
+ − 628 x_tick_offset => -1,
+ − 629 bgclr => 'white',
+ − 630 transparent => 0,
+ − 631 two_axes => 1,
+ − 632 use_axis => [1,1,1,2,2,2],
+ − 633 legend_placement => 'RC',
+ − 634 legend_spacing => 6,
+ − 635 legend_marker_width => 24,
+ − 636 legend_marker_height => 18,
+ − 637 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
+ − 638 x_label => 'position (bp)',
+ − 639 y1_label => '% methylation',
+ − 640 y2_label => '# calls',
+ − 641 title => 'M-bias (Read 2)',
+ − 642 ) or die $graph2->error;
+ − 643
+ − 644 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
+ − 645 my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error;
+ − 646
+ − 647 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
+ − 648 binmode MBIAS_G2;
+ − 649 print MBIAS_G2 $gd2->png;
+ − 650
+ − 651 }
+ − 652 }
+ − 653 }
0
+ − 654
+ − 655 sub process_commandline{
+ − 656 my $help;
+ − 657 my $single_end;
+ − 658 my $paired_end;
+ − 659 my $ignore;
3
+ − 660 my $ignore_r2;
0
+ − 661 my $genomic_fasta;
+ − 662 my $full;
+ − 663 my $report;
+ − 664 my $extractor_version;
+ − 665 my $no_overlap;
+ − 666 my $merge_non_CpG;
+ − 667 my $vanilla;
+ − 668 my $output_dir;
+ − 669 my $no_header;
+ − 670 my $bedGraph;
+ − 671 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
+ − 672 my $remove;
+ − 673 my $counts;
+ − 674 my $cytosine_report;
+ − 675 my $genome_folder;
+ − 676 my $zero;
+ − 677 my $CpG_only;
+ − 678 my $CX_context;
+ − 679 my $split_by_chromosome;
+ − 680 my $sort_size;
+ − 681 my $samtools_path;
+ − 682 my $gzip;
3
+ − 683 my $mbias_only;
+ − 684 my $gazillion;
+ − 685 my $ample_mem;
+ − 686
+ − 687 my $command_line = GetOptions ('help|man' => \$help,
+ − 688 'p|paired-end' => \$paired_end,
+ − 689 's|single-end' => \$single_end,
+ − 690 'fasta' => \$genomic_fasta,
+ − 691 'ignore=i' => \$ignore,
+ − 692 'ignore_r2=i' => \$ignore_r2,
+ − 693 'comprehensive' => \$full,
+ − 694 'report' => \$report,
+ − 695 'version' => \$extractor_version,
+ − 696 'no_overlap' => \$no_overlap,
+ − 697 'merge_non_CpG' => \$merge_non_CpG,
+ − 698 'vanilla' => \$vanilla,
+ − 699 'o|output=s' => \$output_dir,
+ − 700 'no_header' => \$no_header,
+ − 701 'bedGraph' => \$bedGraph,
+ − 702 "cutoff=i" => \$coverage_threshold,
+ − 703 "remove_spaces" => \$remove,
+ − 704 "counts" => \$counts,
+ − 705 "cytosine_report" => \$cytosine_report,
+ − 706 'g|genome_folder=s' => \$genome_folder,
+ − 707 "zero_based" => \$zero,
+ − 708 "CX|CX_context" => \$CX_context,
+ − 709 "split_by_chromosome" => \$split_by_chromosome,
+ − 710 "buffer_size=s" => \$sort_size,
+ − 711 'samtools_path=s' => \$samtools_path,
+ − 712 "gzip" => \$gzip,
+ − 713 "mbias_only" => \$mbias_only,
+ − 714 "gazillion|scaffolds" => \$gazillion,
+ − 715 "ample_memory" => \$ample_mem,
+ − 716 );
0
+ − 717
+ − 718 ### EXIT ON ERROR if there were errors with any of the supplied options
+ − 719 unless ($command_line){
+ − 720 die "Please respecify command line options\n";
+ − 721 }
+ − 722
+ − 723 ### HELPFILE
+ − 724 if ($help){
+ − 725 print_helpfile();
+ − 726 exit;
+ − 727 }
+ − 728
+ − 729 if ($extractor_version){
+ − 730 print << "VERSION";
+ − 731
+ − 732
+ − 733 Bismark Methylation Extractor
+ − 734
+ − 735 Bismark Extractor Version: $version
+ − 736 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
+ − 737 www.bioinformatics.babraham.ac.uk/projects/bismark/
+ − 738
+ − 739
+ − 740 VERSION
+ − 741 exit;
+ − 742 }
+ − 743
+ − 744
+ − 745 ### no files provided
+ − 746 unless (@ARGV){
+ − 747 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
+ − 748 }
+ − 749 @filenames = @ARGV;
+ − 750
+ − 751 warn "\n *** Bismark methylation extractor version $version ***\n\n";
+ − 752
3
+ − 753 ### M-BIAS ONLY
+ − 754 if ($mbias_only){
+ − 755 if ($bedGraph){
+ − 756 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
+ − 757 }
+ − 758 if ($cytosine_report){
+ − 759 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
+ − 760 }
+ − 761 if ($merge_non_CpG){
+ − 762 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
+ − 763 }
+ − 764 if ($full){
+ − 765 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
+ − 766 }
+ − 767 sleep(3);
0
+ − 768 }
3
+ − 769
0
+ − 770 ### PRINT A REPORT
+ − 771 unless ($report){
+ − 772 $report = 0;
+ − 773 }
+ − 774
+ − 775 ### OUTPUT DIR PATH
+ − 776 if ($output_dir){
+ − 777 unless ($output_dir =~ /\/$/){
+ − 778 $output_dir =~ s/$/\//;
+ − 779 }
+ − 780 }
+ − 781 else{
+ − 782 $output_dir = '';
+ − 783 }
+ − 784
+ − 785 ### NO HEADER
+ − 786 unless ($no_header){
+ − 787 $no_header = 0;
+ − 788 }
+ − 789
+ − 790 ### OLD (VANILLA) OUTPUT FORMAT
+ − 791 unless ($vanilla){
+ − 792 $vanilla = 0;
+ − 793 }
+ − 794
+ − 795 if ($single_end){
+ − 796 $paired_end = 0; ### SINGLE END ALIGNMENTS
+ − 797 }
+ − 798 elsif ($paired_end){
+ − 799 $single_end = 0; ### PAIRED-END ALIGNMENTS
+ − 800 }
+ − 801 else{
3
+ − 802
+ − 803 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
+ − 804
+ − 805 if ($vanilla){
+ − 806 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
+ − 807 }
+ − 808 else{ # SAM/BAM format
+ − 809
+ − 810 my $file = $filenames[0];
+ − 811 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
+ − 812
+ − 813 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
+ − 814 if ($file =~ /\.gz$/){
+ − 815 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n";
+ − 816 }
+ − 817 elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){
+ − 818 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+ − 819 }
+ − 820 else{
+ − 821 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
+ − 822 }
+ − 823
+ − 824 while (<DETERMINE>){
+ − 825 last unless (/^\@/);
+ − 826 if ($_ =~ /^\@PG/){
+ − 827 # warn "found the \@PG line:\n";
+ − 828 # warn "$_";
+ − 829
+ − 830 if ($_ =~ /-1/ and $_ =~ /-2/){
+ − 831 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
+ − 832 $paired_end = 1;
+ − 833 $single_end = 0;
+ − 834 }
+ − 835 else{
+ − 836 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
+ − 837 $paired_end = 0;
+ − 838 $single_end = 1;
+ − 839 }
+ − 840 }
+ − 841 }
+ − 842
+ − 843 close DETERMINE or warn $!;
+ − 844
+ − 845 }
0
+ − 846 }
+ − 847
3
+ − 848 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
+ − 849 unless ($ignore){
+ − 850 $ignore = 0;
+ − 851 }
+ − 852
+ − 853 if (defined $ignore_r2){
+ − 854 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
+ − 855 }
+ − 856 else{
+ − 857 $ignore_r2 = 0;
+ − 858 }
+ − 859
+ − 860
0
+ − 861 ### NO OVERLAP
+ − 862 if ($no_overlap){
+ − 863 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
+ − 864 }
+ − 865 else{
+ − 866 $no_overlap = 0;
+ − 867 }
+ − 868
+ − 869 ### COMPREHENSIVE OUTPUT
+ − 870 unless ($full){
+ − 871 $full = 0;
+ − 872 }
+ − 873
+ − 874 ### MERGE NON-CpG context
+ − 875 unless ($merge_non_CpG){
+ − 876 $merge_non_CpG = 0;
+ − 877 }
+ − 878
+ − 879 ### remove white spaces in read ID (needed for sorting using the sort command
+ − 880 unless ($remove){
+ − 881 $remove = 0;
+ − 882 }
+ − 883
+ − 884 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
+ − 885 if (defined $coverage_threshold){
+ − 886 unless ($coverage_threshold > 0){
+ − 887 die "Please select a coverage greater than 0 (positive integers only)\n";
+ − 888 }
+ − 889 }
+ − 890 else{
+ − 891 $coverage_threshold = 1;
+ − 892 }
+ − 893
+ − 894 ### SORT buffer size
+ − 895 if (defined $sort_size){
+ − 896 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
+ − 897 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
+ − 898 }
+ − 899 }
+ − 900 else{
+ − 901 $sort_size = '2G';
+ − 902 }
+ − 903
+ − 904 if ($zero){
+ − 905 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
+ − 906 }
+ − 907
+ − 908 if ($CX_context){
+ − 909 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
+ − 910 }
+ − 911 else{
+ − 912 $CX_context = 0;
+ − 913 }
+ − 914
+ − 915 unless ($counts){
3
+ − 916 $counts = 1; # counts will always be set
0
+ − 917 }
+ − 918
+ − 919 if ($cytosine_report){
+ − 920
+ − 921 ### GENOME folder
+ − 922 if ($genome_folder){
+ − 923 unless ($genome_folder =~/\/$/){
+ − 924 $genome_folder =~ s/$/\//;
+ − 925 }
+ − 926 }
+ − 927 else{
+ − 928 die "Please specify a genome folder to proceed (full path only)\n";
+ − 929 }
+ − 930
+ − 931 unless ($bedGraph){
+ − 932 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
+ − 933 $bedGraph = 1;
+ − 934 }
+ − 935 unless ($counts){
3
+ − 936 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
0
+ − 937 $counts = 1;
+ − 938 }
+ − 939 warn "\n";
+ − 940 }
+ − 941
+ − 942 ### PATH TO SAMTOOLS
+ − 943 if (defined $samtools_path){
+ − 944 # if Samtools was specified as full command
+ − 945 if ($samtools_path =~ /samtools$/){
+ − 946 if (-e $samtools_path){
+ − 947 # Samtools executable found
+ − 948 }
+ − 949 else{
+ − 950 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ − 951 }
+ − 952 }
+ − 953 else{
+ − 954 unless ($samtools_path =~ /\/$/){
+ − 955 $samtools_path =~ s/$/\//;
+ − 956 }
+ − 957 $samtools_path .= 'samtools';
+ − 958 if (-e $samtools_path){
+ − 959 # Samtools executable found
+ − 960 }
+ − 961 else{
+ − 962 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+ − 963 }
+ − 964 }
+ − 965 }
+ − 966 # Check whether Samtools is in the PATH if no path was supplied by the user
+ − 967 else{
+ − 968 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
+ − 969 $samtools_path = `which samtools`;
+ − 970 chomp $samtools_path;
+ − 971 }
+ − 972 }
+ − 973
+ − 974 unless (defined $samtools_path){
+ − 975 $samtools_path = '';
+ − 976 }
+ − 977
3
+ − 978
+ − 979 if ($gazillion){
+ − 980 if ($ample_mem){
+ − 981 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
+ − 982 }
+ − 983 }
+ − 984
+ − 985 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem);
+ − 986 }
+ − 987
+ − 988
+ − 989 sub test_positional_sorting{
+ − 990
+ − 991 my $filename = shift;
+ − 992
+ − 993 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
+ − 994 sleep(1);
+ − 995
+ − 996 if ($filename =~ /\.gz$/) {
+ − 997 open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
+ − 998 }
+ − 999 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
+ − 1000 if ($samtools_path){
+ − 1001 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
+ − 1002 }
+ − 1003 else{
+ − 1004 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
+ − 1005 }
+ − 1006 }
+ − 1007 else {
+ − 1008 open (TEST,$filename) or die "Can't open file $filename: $!\n";
+ − 1009 }
+ − 1010
+ − 1011 my $count = 0;
+ − 1012
+ − 1013 while (<TEST>) {
+ − 1014 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
+ − 1015 if (/^\@SO/) {
+ − 1016 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
+ − 1017 }
+ − 1018 next;
+ − 1019 }
+ − 1020 $count++;
+ − 1021
+ − 1022 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
+ − 1023
+ − 1024 my ($id_1) = (split (/\t/));
+ − 1025
+ − 1026 ### reading the next line which should be read 2
+ − 1027 $_ = <TEST>;
+ − 1028 my ($id_2) = (split (/\t/));
+ − 1029 last unless ($id_2);
+ − 1030 ++$count;
+ − 1031
+ − 1032 if ($id_1 eq $id_2){
+ − 1033 ### ids are the same
+ − 1034 next;
+ − 1035 }
+ − 1036 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
+ − 1037 my $id_1_trunc = $id_1;
+ − 1038 $id_1_trunc =~ s/\/1$//;
+ − 1039 my $id_2_trunc = $id_2;
+ − 1040 $id_2_trunc =~ s/\/2$//;
+ − 1041
+ − 1042 unless ($id_1_trunc eq $id_2_trunc){
+ − 1043 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
+ − 1044 }
+ − 1045 }
+ − 1046 }
+ − 1047 # close TEST or die $!; somehow fails on our cluster...
+ − 1048 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
+ − 1049 warn "...passed!\n";
+ − 1050 sleep(1);
+ − 1051
0
+ − 1052 }
+ − 1053
+ − 1054
+ − 1055 sub process_Bismark_results_file{
+ − 1056 my $filename = shift;
+ − 1057
+ − 1058 warn "\nNow reading in Bismark result file $filename\n\n";
+ − 1059
+ − 1060 if ($filename =~ /\.gz$/) {
+ − 1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
+ − 1062 }
3
+ − 1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
0
+ − 1064 if ($samtools_path){
+ − 1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
+ − 1066 }
+ − 1067 else{
+ − 1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
+ − 1069 }
+ − 1070 }
+ − 1071 else {
+ − 1072 open (IN,$filename) or die "Can't open file $filename: $!\n";
+ − 1073 }
+ − 1074
+ − 1075 ### Vanilla and SAM output need to read different numbers of header lines
+ − 1076 if ($vanilla) {
+ − 1077 my $bismark_version = <IN>; ## discarding the Bismark version info
+ − 1078 chomp $bismark_version;
+ − 1079 $bismark_version =~ s/\r//; # replaces \r line feed
+ − 1080 $bismark_version =~ s/Bismark version: //;
+ − 1081 if ($bismark_version =~ /^\@/) {
+ − 1082 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
+ − 1083 sleep (2);
+ − 1084 }
+ − 1085
+ − 1086 unless ($version eq $bismark_version){
+ − 1087 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
+ − 1088 }
+ − 1089 } else {
+ − 1090 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
+ − 1091 # We are reading from it further down
+ − 1092 }
+ − 1093
+ − 1094 my $output_filename = (split (/\//,$filename))[-1];
+ − 1095
+ − 1096 ### OPENING OUTPUT-FILEHANDLES
+ − 1097 if ($report) {
+ − 1098 my $report_filename = $output_filename;
+ − 1099 $report_filename =~ s/\.sam$//;
+ − 1100 $report_filename =~ s/\.txt$//;
+ − 1101 $report_filename =~ s/$/_splitting_report.txt/;
+ − 1102 $report_filename = $output_dir . $report_filename;
+ − 1103 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
+ − 1104 }
+ − 1105
+ − 1106 if ($report) {
+ − 1107 print REPORT "$output_filename\n\n";
+ − 1108 print REPORT "Parameters used to extract methylation information:\n";
+ − 1109 if ($paired) {
+ − 1110 if ($vanilla) {
+ − 1111 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
+ − 1112 } else {
+ − 1113 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
+ − 1114 }
+ − 1115 }
+ − 1116
+ − 1117 if ($single) {
+ − 1118 if ($vanilla) {
+ − 1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
+ − 1120 } else {
+ − 1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
+ − 1122 }
+ − 1123 }
3
+ − 1124 if ($single){
+ − 1125 if ($ignore) {
+ − 1126 print REPORT "Ignoring first $ignore bp\n";
+ − 1127 }
+ − 1128 }
+ − 1129 else{ # paired-end
+ − 1130 if ($ignore) {
+ − 1131 print REPORT "Ignoring first $ignore bp of Read 1\n";
+ − 1132 }
+ − 1133 if ($ignore_r2){
+ − 1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
+ − 1135 }
0
+ − 1136 }
+ − 1137
+ − 1138 if ($full) {
+ − 1139 print REPORT "Output specified: comprehensive\n";
+ − 1140 } else {
+ − 1141 print REPORT "Output specified: strand-specific (default)\n";
+ − 1142 }
+ − 1143
+ − 1144 if ($no_overlap) {
+ − 1145 print REPORT "No overlapping methylation calls specified\n";
+ − 1146 }
+ − 1147 if ($genomic_fasta) {
+ − 1148 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
+ − 1149 }
+ − 1150 if ($merge_non_CpG) {
+ − 1151 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
+ − 1152 }
+ − 1153
+ − 1154 print REPORT "\n";
+ − 1155 }
+ − 1156
+ − 1157 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
+ − 1158
+ − 1159 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
+ − 1160 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
+ − 1161 if ($full and $merge_non_CpG) {
+ − 1162 my $cpg_output = my $other_c_output = $output_filename;
+ − 1163 ### C in CpG context
+ − 1164 $cpg_output =~ s/^/CpG_context_/;
+ − 1165 $cpg_output =~ s/sam$/txt/;
+ − 1166 $cpg_output =~ s/bam$/txt/;
+ − 1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ − 1168 $cpg_output = $output_dir . $cpg_output;
+ − 1169
+ − 1170 if ($gzip){
+ − 1171 $cpg_output .= '.gz';
3
+ − 1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
0
+ − 1173 }
+ − 1174 else{
3
+ − 1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
0
+ − 1176 }
+ − 1177
3
+ − 1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
0
+ − 1179 push @sorting_files,$cpg_output;
+ − 1180
+ − 1181 unless ($no_header) {
3
+ − 1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1183 }
+ − 1184
+ − 1185 ### C in any other context than CpG
+ − 1186 $other_c_output =~ s/^/Non_CpG_context_/;
+ − 1187 $other_c_output =~ s/sam$/txt/;
+ − 1188 $other_c_output =~ s/bam$/txt/;
+ − 1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
+ − 1190 $other_c_output = $output_dir . $other_c_output;
+ − 1191
+ − 1192 if ($gzip){
+ − 1193 $other_c_output .= '.gz';
3
+ − 1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
0
+ − 1195 }
+ − 1196 else{
3
+ − 1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
0
+ − 1198 }
+ − 1199
3
+ − 1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
0
+ − 1201 push @sorting_files,$other_c_output;
+ − 1202
+ − 1203
+ − 1204 unless ($no_header) {
3
+ − 1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1206 }
+ − 1207 }
+ − 1208
+ − 1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
+ − 1210 elsif ($merge_non_CpG) {
+ − 1211
+ − 1212 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+ − 1213
+ − 1214 ### For cytosines in CpG context
+ − 1215 $cpg_ot =~ s/^/CpG_OT_/;
+ − 1216 $cpg_ot =~ s/sam$/txt/;
+ − 1217 $cpg_ot =~ s/bam$/txt/;
+ − 1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ − 1219 $cpg_ot = $output_dir . $cpg_ot;
+ − 1220
+ − 1221 if ($gzip){
+ − 1222 $cpg_ot .= '.gz';
3
+ − 1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
0
+ − 1224 }
+ − 1225 else{
3
+ − 1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
0
+ − 1227 }
+ − 1228
3
+ − 1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
0
+ − 1230 push @sorting_files,$cpg_ot;
+ − 1231
+ − 1232 unless($no_header){
3
+ − 1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1234 }
+ − 1235
+ − 1236 $cpg_ctot =~ s/^/CpG_CTOT_/;
+ − 1237 $cpg_ctot =~ s/sam$/txt/;
+ − 1238 $cpg_ctot =~ s/bam$/txt/;
+ − 1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ − 1240 $cpg_ctot = $output_dir . $cpg_ctot;
+ − 1241
+ − 1242 if ($gzip){
+ − 1243 $cpg_ctot .= '.gz';
3
+ − 1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
0
+ − 1245 }
+ − 1246 else{
3
+ − 1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
0
+ − 1248 }
+ − 1249
3
+ − 1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
0
+ − 1251 push @sorting_files,$cpg_ctot;
+ − 1252
+ − 1253 unless($no_header){
3
+ − 1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1255 }
+ − 1256
+ − 1257 $cpg_ctob =~ s/^/CpG_CTOB_/;
+ − 1258 $cpg_ctob =~ s/sam$/txt/;
+ − 1259 $cpg_ctob =~ s/bam$/txt/;
+ − 1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ − 1261 $cpg_ctob = $output_dir . $cpg_ctob;
+ − 1262
+ − 1263 if ($gzip){
+ − 1264 $cpg_ctob .= '.gz';
3
+ − 1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
0
+ − 1266 }
+ − 1267 else{
3
+ − 1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
0
+ − 1269 }
+ − 1270
3
+ − 1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
0
+ − 1272 push @sorting_files,$cpg_ctob;
+ − 1273
+ − 1274 unless($no_header){
3
+ − 1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1276 }
+ − 1277
+ − 1278 $cpg_ob =~ s/^/CpG_OB_/;
+ − 1279 $cpg_ob =~ s/sam$/txt/;
+ − 1280 $cpg_ob =~ s/bam$/txt/;
+ − 1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ − 1282 $cpg_ob = $output_dir . $cpg_ob;
+ − 1283
+ − 1284 if ($gzip){
+ − 1285 $cpg_ob .= '.gz';
3
+ − 1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
0
+ − 1287 }
+ − 1288 else{
3
+ − 1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
0
+ − 1290 }
+ − 1291
3
+ − 1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
0
+ − 1293 push @sorting_files,$cpg_ob;
+ − 1294
+ − 1295 unless($no_header){
3
+ − 1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1297 }
+ − 1298
+ − 1299 ### For cytosines in Non-CpG (CC, CT or CA) context
+ − 1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
+ − 1301
+ − 1302 $other_c_ot =~ s/^/Non_CpG_OT_/;
+ − 1303 $other_c_ot =~ s/sam$/txt/;
+ − 1304 $other_c_ot =~ s/bam$/txt/;
+ − 1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
+ − 1306 $other_c_ot = $output_dir . $other_c_ot;
+ − 1307
+ − 1308 if ($gzip){
+ − 1309 $other_c_ot .= '.gz';
3
+ − 1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
0
+ − 1311 }
+ − 1312 else{
3
+ − 1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
0
+ − 1314 }
+ − 1315
3
+ − 1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
0
+ − 1317 push @sorting_files,$other_c_ot;
+ − 1318
+ − 1319 unless($no_header){
3
+ − 1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1321 }
+ − 1322
+ − 1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
+ − 1324 $other_c_ctot =~ s/sam$/txt/;
+ − 1325 $other_c_ctot =~ s/bam$/txt/;
+ − 1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
+ − 1327 $other_c_ctot = $output_dir . $other_c_ctot;
+ − 1328
+ − 1329 if ($gzip){
+ − 1330 $other_c_ctot .= '.gz';
3
+ − 1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
0
+ − 1332 }
+ − 1333 else{
3
+ − 1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
0
+ − 1335 }
+ − 1336
3
+ − 1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
0
+ − 1338 push @sorting_files,$other_c_ctot;
+ − 1339
+ − 1340 unless($no_header){
3
+ − 1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1342 }
+ − 1343
+ − 1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
+ − 1345 $other_c_ctob =~ s/sam$/txt/;
+ − 1346 $other_c_ctob =~ s/bam$/txt/;
+ − 1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
+ − 1348 $other_c_ctob = $output_dir . $other_c_ctob;
+ − 1349
+ − 1350 if ($gzip){
+ − 1351 $other_c_ctob .= '.gz';
3
+ − 1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
0
+ − 1353 }
+ − 1354 else{
3
+ − 1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
0
+ − 1356 }
+ − 1357
3
+ − 1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
0
+ − 1359 push @sorting_files,$other_c_ctob;
+ − 1360
+ − 1361 unless($no_header){
3
+ − 1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1363 }
+ − 1364
+ − 1365 $other_c_ob =~ s/^/Non_CpG_OB_/;
+ − 1366 $other_c_ob =~ s/sam$/txt/;
+ − 1367 $other_c_ob =~ s/sam$/txt/;
+ − 1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
+ − 1369 $other_c_ob = $output_dir . $other_c_ob;
+ − 1370
+ − 1371 if ($gzip){
+ − 1372 $other_c_ob .= '.gz';
3
+ − 1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
0
+ − 1374 }
+ − 1375 else{
3
+ − 1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
0
+ − 1377 }
+ − 1378
3
+ − 1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
0
+ − 1380 push @sorting_files,$other_c_ob;
+ − 1381
+ − 1382 unless($no_header){
3
+ − 1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1384 }
+ − 1385 }
+ − 1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
+ − 1387
+ − 1388 ### if --comprehensive was specified we are only writing one file per context
+ − 1389 elsif ($full) {
+ − 1390 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
+ − 1391 ### C in CpG context
+ − 1392 $cpg_output =~ s/^/CpG_context_/;
+ − 1393 $cpg_output =~ s/sam$/txt/;
+ − 1394 $cpg_output =~ s/bam$/txt/;
+ − 1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ − 1396 $cpg_output = $output_dir . $cpg_output;
+ − 1397
+ − 1398 if ($gzip){
+ − 1399 $cpg_output .= '.gz';
3
+ − 1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
0
+ − 1401 }
+ − 1402 else{
3
+ − 1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
0
+ − 1404 }
+ − 1405
3
+ − 1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
0
+ − 1407 push @sorting_files,$cpg_output;
+ − 1408
+ − 1409 unless($no_header){
3
+ − 1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1411 }
+ − 1412
+ − 1413 ### C in CHG context
+ − 1414 $chg_output =~ s/^/CHG_context_/;
+ − 1415 $chg_output =~ s/sam$/txt/;
+ − 1416 $chg_output =~ s/bam$/txt/;
+ − 1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
+ − 1418 $chg_output = $output_dir . $chg_output;
+ − 1419
+ − 1420 if ($gzip){
+ − 1421 $chg_output .= '.gz';
3
+ − 1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
0
+ − 1423 }
+ − 1424 else{
3
+ − 1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
0
+ − 1426 }
+ − 1427
3
+ − 1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
0
+ − 1429 push @sorting_files,$chg_output;
+ − 1430
+ − 1431 unless($no_header){
3
+ − 1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1433 }
+ − 1434
+ − 1435 ### C in CHH context
+ − 1436 $chh_output =~ s/^/CHH_context_/;
+ − 1437 $chh_output =~ s/sam$/txt/;
+ − 1438 $chh_output =~ s/bam$/txt/;
+ − 1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
+ − 1440 $chh_output = $output_dir . $chh_output;
+ − 1441
+ − 1442 if ($gzip){
+ − 1443 $chh_output .= '.gz';
3
+ − 1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
0
+ − 1445 }
+ − 1446 else{
3
+ − 1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
0
+ − 1448 }
+ − 1449
3
+ − 1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
0
+ − 1451 push @sorting_files, $chh_output;
+ − 1452
+ − 1453 unless($no_header){
3
+ − 1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1455 }
+ − 1456 }
+ − 1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
+ − 1458 else {
+ − 1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+ − 1460
+ − 1461 ### For cytosines in CpG context
+ − 1462 $cpg_ot =~ s/^/CpG_OT_/;
+ − 1463 $cpg_ot =~ s/sam$/txt/;
+ − 1464 $cpg_ot =~ s/bam$/txt/;
+ − 1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ − 1466 $cpg_ot = $output_dir . $cpg_ot;
+ − 1467
+ − 1468 if ($gzip){
+ − 1469 $cpg_ot .= '.gz';
3
+ − 1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
0
+ − 1471 }
+ − 1472 else{
3
+ − 1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
0
+ − 1474 }
+ − 1475
3
+ − 1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
0
+ − 1477 push @sorting_files,$cpg_ot;
+ − 1478
+ − 1479 unless($no_header){
3
+ − 1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1481 }
+ − 1482
+ − 1483 $cpg_ctot =~ s/^/CpG_CTOT_/;
+ − 1484 $cpg_ctot =~ s/sam$/txt/;
+ − 1485 $cpg_ctot =~ s/bam$/txt/;
+ − 1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ − 1487 $cpg_ctot = $output_dir . $cpg_ctot;
+ − 1488
+ − 1489 if ($gzip){
+ − 1490 $cpg_ctot .= '.gz';
3
+ − 1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
0
+ − 1492 }
+ − 1493 else{
3
+ − 1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
0
+ − 1495 }
+ − 1496
3
+ − 1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
0
+ − 1498 push @sorting_files,$cpg_ctot;
+ − 1499
+ − 1500 unless($no_header){
3
+ − 1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1502 }
+ − 1503
+ − 1504 $cpg_ctob =~ s/^/CpG_CTOB_/;
+ − 1505 $cpg_ctob =~ s/sam$/txt/;
+ − 1506 $cpg_ctob =~ s/bam$/txt/;
+ − 1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ − 1508 $cpg_ctob = $output_dir . $cpg_ctob;
+ − 1509
+ − 1510 if ($gzip){
+ − 1511 $cpg_ctob .= '.gz';
3
+ − 1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
0
+ − 1513 }
+ − 1514 else{
3
+ − 1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
0
+ − 1516 }
+ − 1517
3
+ − 1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
0
+ − 1519 push @sorting_files,$cpg_ctob;
+ − 1520
+ − 1521 unless($no_header){
3
+ − 1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1523 }
+ − 1524
+ − 1525 $cpg_ob =~ s/^/CpG_OB_/;
+ − 1526 $cpg_ob =~ s/sam$/txt/;
+ − 1527 $cpg_ob =~ s/bam$/txt/;
+ − 1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ − 1529 $cpg_ob = $output_dir . $cpg_ob;
+ − 1530
+ − 1531 if ($gzip){
+ − 1532 $cpg_ob .= '.gz';
3
+ − 1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
0
+ − 1534 }
+ − 1535 else{
3
+ − 1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
0
+ − 1537 }
+ − 1538
3
+ − 1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
0
+ − 1540 push @sorting_files,$cpg_ob;
+ − 1541
+ − 1542 unless($no_header){
3
+ − 1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1544 }
+ − 1545
+ − 1546 ### For cytosines in CHG context
+ − 1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
+ − 1548
+ − 1549 $chg_ot =~ s/^/CHG_OT_/;
+ − 1550 $chg_ot =~ s/sam$/txt/;
+ − 1551 $chg_ot =~ s/bam$/txt/;
+ − 1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
+ − 1553 $chg_ot = $output_dir . $chg_ot;
+ − 1554
+ − 1555 if ($gzip){
+ − 1556 $chg_ot .= '.gz';
3
+ − 1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
0
+ − 1558 }
+ − 1559 else{
3
+ − 1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
0
+ − 1561 }
+ − 1562
3
+ − 1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
0
+ − 1564 push @sorting_files,$chg_ot;
+ − 1565
+ − 1566 unless($no_header){
3
+ − 1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1568 }
+ − 1569
+ − 1570 $chg_ctot =~ s/^/CHG_CTOT_/;
+ − 1571 $chg_ctot =~ s/sam$/txt/;
+ − 1572 $chg_ctot =~ s/bam$/txt/;
+ − 1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
+ − 1574 $chg_ctot = $output_dir . $chg_ctot;
+ − 1575
+ − 1576 if ($gzip){
+ − 1577 $chg_ctot .= '.gz';
3
+ − 1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
0
+ − 1579 }
+ − 1580 else{
3
+ − 1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
0
+ − 1582 }
+ − 1583
3
+ − 1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
0
+ − 1585 push @sorting_files,$chg_ctot;
+ − 1586
+ − 1587 unless($no_header){
3
+ − 1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1589 }
+ − 1590
+ − 1591 $chg_ctob =~ s/^/CHG_CTOB_/;
+ − 1592 $chg_ctob =~ s/sam$/txt/;
+ − 1593 $chg_ctob =~ s/bam$/txt/;
+ − 1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
+ − 1595 $chg_ctob = $output_dir . $chg_ctob;
+ − 1596
+ − 1597 if ($gzip){
+ − 1598 $chg_ctob .= '.gz';
3
+ − 1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
0
+ − 1600 }
+ − 1601 else{
3
+ − 1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
0
+ − 1603 }
+ − 1604
3
+ − 1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
0
+ − 1606 push @sorting_files,$chg_ctob;
+ − 1607
+ − 1608 unless($no_header){
3
+ − 1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1610 }
+ − 1611
+ − 1612 $chg_ob =~ s/^/CHG_OB_/;
+ − 1613 $chg_ob =~ s/sam$/txt/;
+ − 1614 $chg_ob =~ s/bam$/txt/;
+ − 1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
+ − 1616 $chg_ob = $output_dir . $chg_ob;
+ − 1617
+ − 1618 if ($gzip){
+ − 1619 $chg_ob .= '.gz';
3
+ − 1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
0
+ − 1621 }
+ − 1622 else{
3
+ − 1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
0
+ − 1624 }
+ − 1625
3
+ − 1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
0
+ − 1627 push @sorting_files,$chg_ob;
+ − 1628
+ − 1629 unless($no_header){
3
+ − 1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1631 }
+ − 1632
+ − 1633 ### For cytosines in CHH context
+ − 1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
+ − 1635
+ − 1636 $chh_ot =~ s/^/CHH_OT_/;
+ − 1637 $chh_ot =~ s/sam$/txt/;
+ − 1638 $chh_ot =~ s/bam$/txt/;
+ − 1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
+ − 1640 $chh_ot = $output_dir . $chh_ot;
+ − 1641
+ − 1642 if ($gzip){
+ − 1643 $chh_ot .= '.gz';
3
+ − 1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
0
+ − 1645 }
+ − 1646 else{
3
+ − 1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
0
+ − 1648 }
+ − 1649
3
+ − 1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
0
+ − 1651 push @sorting_files,$chh_ot;
+ − 1652
+ − 1653 unless($no_header){
3
+ − 1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1655 }
+ − 1656
+ − 1657 $chh_ctot =~ s/^/CHH_CTOT_/;
+ − 1658 $chh_ctot =~ s/sam$/txt/;
+ − 1659 $chh_ctot =~ s/bam$/txt/;
+ − 1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
+ − 1661 $chh_ctot = $output_dir . $chh_ctot;
+ − 1662
+ − 1663 if ($gzip){
+ − 1664 $chh_ctot .= '.gz';
3
+ − 1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
0
+ − 1666 }
+ − 1667 else{
3
+ − 1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
0
+ − 1669 }
+ − 1670
3
+ − 1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
0
+ − 1672 push @sorting_files,$chh_ctot;
+ − 1673
+ − 1674 unless($no_header){
3
+ − 1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1676 }
+ − 1677
+ − 1678 $chh_ctob =~ s/^/CHH_CTOB_/;
+ − 1679 $chh_ctob =~ s/sam$/txt/;
+ − 1680 $chh_ctob =~ s/bam$/txt/;
+ − 1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
+ − 1682 $chh_ctob = $output_dir . $chh_ctob;
+ − 1683
+ − 1684 if ($gzip){
+ − 1685 $chh_ctob .= '.gz';
3
+ − 1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
0
+ − 1687 }
+ − 1688 else{
3
+ − 1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
0
+ − 1690 }
+ − 1691
3
+ − 1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
0
+ − 1693 push @sorting_files,$chh_ctob;
+ − 1694
+ − 1695 unless($no_header){
3
+ − 1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1697 }
+ − 1698
+ − 1699 $chh_ob =~ s/^/CHH_OB_/;
+ − 1700 $chh_ob =~ s/sam$/txt/;
+ − 1701 $chh_ob =~ s/bam$/txt/;
+ − 1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
+ − 1703 $chh_ob = $output_dir . $chh_ob;
+ − 1704
+ − 1705 if ($gzip){
+ − 1706 $chh_ob .= '.gz';
3
+ − 1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
0
+ − 1708 }
+ − 1709 else{
3
+ − 1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
0
+ − 1711 }
+ − 1712
3
+ − 1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
0
+ − 1714 push @sorting_files,$chh_ob;
+ − 1715
+ − 1716 unless($no_header){
3
+ − 1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
0
+ − 1718 }
+ − 1719 }
+ − 1720
+ − 1721 my $methylation_call_strings_processed = 0;
+ − 1722 my $line_count = 0;
+ − 1723
+ − 1724 ### proceeding differently now for single-end or paired-end Bismark files
+ − 1725
+ − 1726 ### PROCESSING SINGLE-END RESULT FILES
+ − 1727 if ($single) {
+ − 1728
+ − 1729 ### also proceeding differently now for SAM format or vanilla Bismark format files
+ − 1730 if ($vanilla) { # old vanilla Bismark output format
+ − 1731 while (<IN>) {
+ − 1732 ++$line_count;
+ − 1733 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 1734
+ − 1735 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ − 1736 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
+ − 1737
+ − 1738 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
+ − 1739 ### last position
+ − 1740 chomp $genome_conversion;
+ − 1741
+ − 1742 my $index;
+ − 1743 if ($meth_call) {
+ − 1744
+ − 1745 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ − 1746 $index = 0;
+ − 1747 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ − 1748 $index = 1;
+ − 1749 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ − 1750 $index = 3;
+ − 1751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ − 1752 $index = 2;
+ − 1753 } else {
+ − 1754 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ − 1755 }
+ − 1756
+ − 1757 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
+ − 1758 if ($ignore) {
+ − 1759 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+ − 1760
+ − 1761 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ − 1762 if ($strand eq '+') {
+ − 1763 $start += $ignore;
+ − 1764 } elsif ($strand eq '-') {
+ − 1765 $start += length($meth_call)-1; ## $meth_call is already shortened!
+ − 1766 } else {
+ − 1767 die "Alignment did not have proper strand information: $strand\n";
+ − 1768 }
+ − 1769 }
+ − 1770 ### printing out the methylation state of every C in the read
+ − 1771 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
+ − 1772
+ − 1773 ++$methylation_call_strings_processed; # 1 per single-end result
+ − 1774 }
+ − 1775 }
+ − 1776 } else { # processing single-end SAM format (default)
+ − 1777 while (<IN>) {
+ − 1778 ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ − 1779 if (/^\@/) { # skipping header lines (starting with @)
+ − 1780 warn "skipping SAM header line:\t$_";
+ − 1781 next;
+ − 1782 }
+ − 1783
+ − 1784 ++$line_count;
+ − 1785 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 1786
+ − 1787 # example read in SAM format
+ − 1788 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ − 1789 ###
+ − 1790
+ − 1791 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
+ − 1792 # < 0.7.6 $meth_call =~ s/^XM:Z://;
+ − 1793 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
+ − 1794 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
+ − 1795
+ − 1796 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
+ − 1797
+ − 1798 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 1799 my $meth_call; ### Thanks to Zachary Zeno for this solution
+ − 1800 my $read_conversion;
+ − 1801 my $genome_conversion;
+ − 1802
+ − 1803 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ − 1804 my $tag = $1;
+ − 1805 my $value = $2;
+ − 1806
+ − 1807 if ($tag eq "XM") {
+ − 1808 $meth_call = $value;
+ − 1809 $meth_call =~ s/\r//;
+ − 1810 } elsif ($tag eq "XR") {
+ − 1811 $read_conversion = $value;
+ − 1812 $read_conversion =~ s/\r//;
+ − 1813 } elsif ($tag eq "XG") {
+ − 1814 $genome_conversion = $value;
+ − 1815 $genome_conversion =~ s/\r//;
+ − 1816 }
+ − 1817 }
+ − 1818
+ − 1819 my $strand;
+ − 1820 chomp $genome_conversion;
+ − 1821 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
+ − 1822
+ − 1823 my $index;
+ − 1824 if ($meth_call) {
+ − 1825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ − 1826 $index = 0;
+ − 1827 $strand = '+';
+ − 1828 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ − 1829 $index = 1;
+ − 1830 $strand = '-';
+ − 1831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ − 1832 $index = 2;
+ − 1833 $strand = '+';
+ − 1834 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ − 1835 $index = 3;
+ − 1836 $strand = '-';
+ − 1837 } else {
+ − 1838 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ − 1839 }
+ − 1840
+ − 1841 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
+ − 1842 if ($strand eq '-') {
+ − 1843 $meth_call = reverse $meth_call;
+ − 1844 }
+ − 1845
+ − 1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
+ − 1847 if ($ignore) {
+ − 1848 # print "\n\n$meth_call\n";
+ − 1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+ − 1850 # print "$meth_call\n";
3
+ − 1851
0
+ − 1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+ − 1853
+ − 1854 my @len = split (/\D+/,$cigar); # storing the length per operation
+ − 1855 my @ops = split (/\d+/,$cigar); # storing the operation
+ − 1856 shift @ops; # remove the empty first element
+ − 1857 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+ − 1858
+ − 1859 my @comp_cigar; # building an array with all CIGAR operations
+ − 1860 foreach my $index (0..$#len) {
+ − 1861 foreach (1..$len[$index]) {
+ − 1862 # print "$ops[$index]";
+ − 1863 push @comp_cigar, $ops[$index];
+ − 1864 }
+ − 1865 }
+ − 1866 # print "original CIGAR: $cigar\n";
+ − 1867 # print "original CIGAR: @comp_cigar\n";
+ − 1868
+ − 1869 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ − 1870 if ($strand eq '+') {
+ − 1871
+ − 1872 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
+ − 1873 my $I_count = 0;
+ − 1874
+ − 1875 for (1..$ignore) {
+ − 1876 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
+ − 1877 # print "$_ deleted $op\n";
+ − 1878
+ − 1879 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1880 $D_count++;
+ − 1881 $op = shift @comp_cigar;
+ − 1882 # print "$_ deleted $op\n";
+ − 1883 }
+ − 1884 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 1885 $I_count++;
+ − 1886 }
+ − 1887 }
+ − 1888 $start += $ignore + $D_count - $I_count;
+ − 1889 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
+ − 1890 } elsif ($strand eq '-') {
+ − 1891
+ − 1892 for (1..$ignore) {
+ − 1893 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 1894 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1895 $op = pop @comp_cigar;
+ − 1896 }
+ − 1897 }
+ − 1898
+ − 1899 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
+ − 1900 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
+ − 1901 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
+ − 1902 foreach (@comp_cigar) {
+ − 1903 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
+ − 1904 }
+ − 1905 $start += $MD_count - 1;
+ − 1906 }
+ − 1907
+ − 1908 ### reconstituting shortened CIGAR string
+ − 1909 my $new_cigar;
+ − 1910 my $count = 0;
+ − 1911 my $last_op;
+ − 1912 # print "ignore adjusted: @comp_cigar\n";
+ − 1913 foreach my $op (@comp_cigar) {
+ − 1914 unless (defined $last_op){
+ − 1915 $last_op = $op;
+ − 1916 ++$count;
+ − 1917 next;
+ − 1918 }
+ − 1919 if ($last_op eq $op) {
+ − 1920 ++$count;
+ − 1921 } else {
+ − 1922 $new_cigar .= "$count$last_op";
+ − 1923 $last_op = $op;
+ − 1924 $count = 1;
+ − 1925 }
+ − 1926 }
+ − 1927 $new_cigar .= "$count$last_op"; # appending the last operation and count
+ − 1928 $cigar = $new_cigar;
+ − 1929 # print "ignore adjusted scalar: $cigar\n";
+ − 1930 }
+ − 1931 }
+ − 1932 ### printing out the methylation state of every C in the read
+ − 1933 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
+ − 1934
+ − 1935 ++$methylation_call_strings_processed; # 1 per single-end result
+ − 1936 }
+ − 1937 }
+ − 1938 }
+ − 1939
+ − 1940 ### PROCESSING PAIRED-END RESULT FILES
+ − 1941 elsif ($paired) {
+ − 1942
+ − 1943 ### proceeding differently now for SAM format or vanilla Bismark format files
+ − 1944 if ($vanilla) { # old vanilla Bismark paired-end output format
+ − 1945 while (<IN>) {
+ − 1946 ++$line_count;
+ − 1947 warn "processed line: $line_count\n" if ($line_count%500000==0);
+ − 1948
+ − 1949 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ − 1950 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
+ − 1951
+ − 1952 my $index;
+ − 1953 chomp $genome_conversion;
+ − 1954
+ − 1955 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ − 1956 $index = 0; ## this is OT
+ − 1957 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ − 1958 $index = 2; ## this is CTOB!!!
+ − 1959 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ − 1960 $index = 1; ## this is CTOT!!!
+ − 1961 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ − 1962 $index = 3; ## this is OB
+ − 1963 } else {
+ − 1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ − 1965 }
+ − 1966
+ − 1967 if ($meth_call_1 and $meth_call_2) {
+ − 1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
3
+ − 1969
0
+ − 1970 if ($ignore) {
+ − 1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
3
+ − 1972
0
+ − 1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
+ − 1974 $start_read_1 += $ignore;
+ − 1975 }
3
+ − 1976 if ($ignore_r2) {
+ − 1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
+ − 1978
+ − 1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
+ − 1980 $end_read_2 -= $ignore_r2;
+ − 1981 }
+ − 1982
0
+ − 1983 my $end_read_1;
+ − 1984 my $start_read_2;
+ − 1985
+ − 1986 if ($strand eq '+') {
+ − 1987
+ − 1988 $end_read_1 = $start_read_1+length($meth_call_1)-1;
+ − 1989 $start_read_2 = $end_read_2-length($meth_call_2)+1;
3
+ − 1990
0
+ − 1991 ## we first pass the first read which is in + orientation on the forward strand
3
+ − 1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
0
+ − 1993
+ − 1994 # we next pass the second read which is in - orientation on the reverse strand
+ − 1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
3
+ − 1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
+ − 1997 }
+ − 1998 else {
0
+ − 1999
+ − 2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
+ − 2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
+ − 2002
+ − 2003 ## we first pass the first read which is in - orientation on the reverse strand
3
+ − 2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
0
+ − 2005
+ − 2006 # we next pass the second read which is in + orientation on the forward strand
+ − 2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
3
+ − 2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
0
+ − 2009 }
+ − 2010
+ − 2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
+ − 2012 }
+ − 2013 }
3
+ − 2014 }
+ − 2015 else { # Bismark paired-end SAM output format (default)
0
+ − 2016 while (<IN>) {
+ − 2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ − 2018 if (/^\@/) { # skipping header lines (starting with @)
+ − 2019 warn "skipping SAM header line:\t$_";
+ − 2020 next;
+ − 2021 }
+ − 2022
+ − 2023 ++$line_count;
+ − 2024 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 2025
+ − 2026 # example paired-end reads in SAM format (2 consecutive lines)
+ − 2027 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ − 2028 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
+ − 2029
+ − 2030 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
+ − 2031
+ − 2032 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 2033 my $meth_call_1;
+ − 2034 my $first_read_conversion;
+ − 2035 my $genome_conversion;
+ − 2036
+ − 2037 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ − 2038 my $tag = $1;
+ − 2039 my $value = $2;
+ − 2040
+ − 2041 if ($tag eq "XM") {
+ − 2042 $meth_call_1 = $value;
+ − 2043 $meth_call_1 =~ s/\r//;
+ − 2044 } elsif ($tag eq "XR") {
+ − 2045 $first_read_conversion = $value;
+ − 2046 $first_read_conversion =~ s/\r//;
+ − 2047 } elsif ($tag eq "XG") {
+ − 2048 $genome_conversion = $value;
+ − 2049 $genome_conversion =~ s/\r//;
+ − 2050 }
+ − 2051 }
+ − 2052
+ − 2053 $_ = <IN>; # reading in the paired read
+ − 2054
+ − 2055 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
+ − 2056 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
+ − 2057 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
+ − 2058 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
+ − 2059 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
+ − 2060
+ − 2061 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 2062
+ − 2063 my $meth_call_2;
+ − 2064 my $second_read_conversion;
+ − 2065
+ − 2066 while ( /(XM|XR):Z:([^\t]+)/g ) {
+ − 2067 my $tag = $1;
+ − 2068 my $value = $2;
+ − 2069
+ − 2070 if ($tag eq "XM") {
+ − 2071 $meth_call_2 = $value;
+ − 2072 $meth_call_2 =~ s/\r//;
+ − 2073 } elsif ($tag eq "XR") {
+ − 2074 $second_read_conversion = $value;
+ − 2075 $second_read_conversion = s/\r//;
+ − 2076 }
+ − 2077 }
+ − 2078
+ − 2079 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
+ − 2080 chomp $genome_conversion; # in case it captured a new line character
+ − 2081
+ − 2082 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
+ − 2083
+ − 2084 my $index;
+ − 2085 my $strand;
+ − 2086
+ − 2087 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ − 2088 $index = 0; ## this is OT
+ − 2089 $strand = '+';
+ − 2090 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ − 2091 $index = 1; ## this is CTOT
+ − 2092 $strand = '-';
+ − 2093 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ − 2094 $index = 2; ## this is CTOB
+ − 2095 $strand = '+';
+ − 2096 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ − 2097 $index = 3; ## this is OB
+ − 2098 $strand = '-';
+ − 2099 } else {
+ − 2100 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ − 2101 }
+ − 2102
+ − 2103 ### reversing the methylation call of the read that was reverse-complemented
+ − 2104 if ($strand eq '+') {
+ − 2105 $meth_call_2 = reverse $meth_call_2;
+ − 2106 } else {
+ − 2107 $meth_call_1 = reverse $meth_call_1;
+ − 2108 }
+ − 2109
+ − 2110 if ($meth_call_1 and $meth_call_2) {
+ − 2111
+ − 2112 my $end_read_1;
+ − 2113
+ − 2114 ### READ 1
+ − 2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
+ − 2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
+ − 2117 shift @ops_1; # remove the empty first element
3
+ − 2118
+ − 2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
0
+ − 2120
+ − 2121 my @comp_cigar_1; # building an array with all CIGAR operations
+ − 2122 foreach my $index (0..$#len_1) {
+ − 2123 foreach (1..$len_1[$index]) {
+ − 2124 # print "$ops_1[$index]";
+ − 2125 push @comp_cigar_1, $ops_1[$index];
+ − 2126 }
+ − 2127 }
+ − 2128 # print "original CIGAR read 1: $cigar_1\n";
+ − 2129 # print "original CIGAR read 1: @comp_cigar_1\n";
+ − 2130
+ − 2131 ### READ 2
+ − 2132 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
+ − 2133 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
+ − 2134 shift @ops_2; # remove the empty first element
+ − 2135 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
+ − 2136 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
+ − 2137 foreach my $index (0..$#len_2) {
+ − 2138 foreach (1..$len_2[$index]) {
+ − 2139 # print "$ops_2[$index]";
+ − 2140 push @comp_cigar_2, $ops_2[$index];
+ − 2141 }
+ − 2142 }
+ − 2143 # print "original CIGAR read 2: $cigar_2\n";
+ − 2144 # print "original CIGAR read 2: @comp_cigar_2\n";
+ − 2145
3
+ − 2146
+ − 2147
0
+ − 2148 if ($ignore) {
3
+ − 2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1
0
+ − 2150 ### the methylation calls have already been reversed where necessary
+ − 2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
+ − 2152
+ − 2153 if ($strand eq '+') {
+ − 2154
+ − 2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
+ − 2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
+ − 2157 my $I_count_1 = 0;
3
+ − 2158
0
+ − 2159 for (1..$ignore) {
+ − 2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
+ − 2161 # print "$_ deleted $op\n";
+ − 2162
+ − 2163 while ($op eq 'D') { # repeating this for deletions (D)
+ − 2164 $D_count_1++;
+ − 2165 $op = shift @comp_cigar_1;
+ − 2166 # print "$_ deleted $op\n";
+ − 2167 }
+ − 2168 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 2169 $I_count_1++;
+ − 2170 }
+ − 2171 }
+ − 2172
+ − 2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
+ − 2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
3
+ − 2175
0
+ − 2176 # the start position of reads mapping to the reverse strand is being adjusted further below
3
+ − 2177 }
+ − 2178 elsif ($strand eq '-') {
0
+ − 2179
+ − 2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
+ − 2181 for (1..$ignore) {
+ − 2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 2183 while ($op eq 'D') { # repeating this for deletions (D)
+ − 2184 $op = pop @comp_cigar_1;
+ − 2185 }
+ − 2186 }
+ − 2187 # the start position of reads mapping to the reverse strand is being adjusted further below
+ − 2188
3
+ − 2189 }
+ − 2190 }
+ − 2191
+ − 2192 if ($ignore_r2) {
+ − 2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
+ − 2194 ### the methylation calls have already been reversed where necessary
+ − 2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
+ − 2196
+ − 2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+ − 2198
+ − 2199 if ($strand eq '+') {
+ − 2200
+ − 2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
+ − 2202
+ − 2203 for (1..$ignore_r2) {
+ − 2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 2205 while ($op eq 'D') { # repeating this for deletions (D)
+ − 2206 $op = pop @comp_cigar_2;
+ − 2207 }
+ − 2208 }
+ − 2209 # the start position of reads mapping to the reverse strand is being adjusted further below
+ − 2210 }
+ − 2211 elsif ($strand eq '-') {
+ − 2212
0
+ − 2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
+ − 2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
3
+ − 2215 my $I_count_2 = 0;
+ − 2216
+ − 2217 for (1..$ignore_r2) {
0
+ − 2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
+ − 2219 # print "$_ deleted $op\n";
+ − 2220
+ − 2221 while ($op eq 'D') { # repeating this for deletions (D)
+ − 2222 $D_count_2++;
+ − 2223 $op = shift @comp_cigar_2;
+ − 2224 # print "$_ deleted $op\n";
+ − 2225 }
+ − 2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 2227 $I_count_2++;
+ − 2228 }
+ − 2229 }
+ − 2230
3
+ − 2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
+ − 2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
+ − 2233 }
+ − 2234 }
0
+ − 2235
3
+ − 2236 if ($ignore){
0
+ − 2237 ### reconstituting shortened CIGAR string 1
+ − 2238 my $new_cigar_1;
+ − 2239 my $count_1 = 0;
+ − 2240 my $last_op_1;
+ − 2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
+ − 2242 foreach my $op (@comp_cigar_1) {
+ − 2243 unless (defined $last_op_1){
+ − 2244 $last_op_1 = $op;
+ − 2245 ++$count_1;
+ − 2246 next;
+ − 2247 }
+ − 2248 if ($last_op_1 eq $op) {
+ − 2249 ++$count_1;
+ − 2250 } else {
+ − 2251 $new_cigar_1 .= "$count_1$last_op_1";
+ − 2252 $last_op_1 = $op;
+ − 2253 $count_1 = 1;
+ − 2254 }
+ − 2255 }
+ − 2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
+ − 2257 $cigar_1 = $new_cigar_1;
+ − 2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
3
+ − 2259 }
+ − 2260
+ − 2261 if ($ignore_r2){
0
+ − 2262
+ − 2263 ### reconstituting shortened CIGAR string 2
+ − 2264 my $new_cigar_2;
+ − 2265 my $count_2 = 0;
+ − 2266 my $last_op_2;
+ − 2267 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
+ − 2268 foreach my $op (@comp_cigar_2) {
+ − 2269 unless (defined $last_op_2){
+ − 2270 $last_op_2 = $op;
+ − 2271 ++$count_2;
+ − 2272 next;
+ − 2273 }
+ − 2274 if ($last_op_2 eq $op) {
+ − 2275 ++$count_2;
3
+ − 2276 }
+ − 2277 else {
0
+ − 2278 $new_cigar_2 .= "$count_2$last_op_2";
+ − 2279 $last_op_2 = $op;
+ − 2280 $count_2 = 1;
+ − 2281 }
+ − 2282 }
+ − 2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
+ − 2284 $cigar_2 = $new_cigar_2;
3
+ − 2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
0
+ − 2286 }
+ − 2287
3
+ − 2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
+ − 2289
0
+ − 2290 if ($strand eq '+') {
+ − 2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
+ − 2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 2293 # print "reverse: @comp_cigar_2\n";
+ − 2294
+ − 2295 my $MD_count_1 = 0;
+ − 2296 foreach (@comp_cigar_1) {
+ − 2297 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 2298 }
+ − 2299
+ − 2300 my $MD_count_2 = 0;
+ − 2301 foreach (@comp_cigar_2) {
+ − 2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 2303 }
+ − 2304
+ − 2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
+ − 2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
3
+ − 2307 }
+ − 2308 else {
0
+ − 2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
+ − 2310
+ − 2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 2312 # print "reverse: @comp_cigar_1\n";
+ − 2313
+ − 2314 my $MD_count_1 = 0;
+ − 2315 foreach (@comp_cigar_1) {
+ − 2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 2317 }
+ − 2318
+ − 2319 $end_read_1 = $start_read_1;
+ − 2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
+ − 2321 }
+ − 2322
+ − 2323 if ($strand eq '+') {
3
+ − 2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
+ − 2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
0
+ − 2326
+ − 2327 # we next pass the second read which is in - orientation on the reverse strand
+ − 2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
3
+ − 2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
0
+ − 2330 } else {
+ − 2331 ## we first pass the first read which is in - orientation on the reverse strand
3
+ − 2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
0
+ − 2333
+ − 2334 # we next pass the second read which is in + orientation on the forward strand
+ − 2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
3
+ − 2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
0
+ − 2337 }
+ − 2338
+ − 2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
+ − 2340 }
+ − 2341 }
+ − 2342 }
+ − 2343 } else {
+ − 2344 die "Single-end or paired-end reads not specified properly\n";
+ − 2345 }
+ − 2346
3
+ − 2347 warn "\n\nProcessed $line_count lines from $filename in total\n";
+ − 2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
0
+ − 2349 if ($report) {
3
+ − 2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n";
0
+ − 2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
+ − 2352 }
+ − 2353 print_splitting_report ();
+ − 2354 }
+ − 2355
+ − 2356
+ − 2357
+ − 2358 sub print_splitting_report{
+ − 2359
+ − 2360 ### Calculating methylation percentages if applicable
+ − 2361
+ − 2362 my $percent_meCpG;
+ − 2363 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ − 2364 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ − 2365 }
+ − 2366
+ − 2367 my $percent_meCHG;
+ − 2368 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ − 2369 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ − 2370 }
+ − 2371
+ − 2372 my $percent_meCHH;
+ − 2373 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ − 2374 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ − 2375 }
+ − 2376
+ − 2377 my $percent_non_CpG_methylation;
+ − 2378 if ($merge_non_CpG){
+ − 2379 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ − 2380 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
+ − 2381 }
+ − 2382 }
+ − 2383
+ − 2384 if ($report){
+ − 2385 ### detailed information about Cs analysed
+ − 2386 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+ − 2387
+ − 2388 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ − 2389 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+ − 2390
+ − 2391 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ − 2392 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ − 2393 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ − 2394
+ − 2395 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ − 2396 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ − 2397 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+ − 2398
+ − 2399 ### calculating methylated CpG percentage if applicable
+ − 2400 if ($percent_meCpG){
+ − 2401 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ − 2402 }
+ − 2403 else{
+ − 2404 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ − 2405 }
+ − 2406
+ − 2407 ### 2-Context Output
+ − 2408 if ($merge_non_CpG){
+ − 2409 if ($percent_non_CpG_methylation){
+ − 2410 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ − 2411 }
+ − 2412 else{
+ − 2413 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ − 2414 }
+ − 2415 }
+ − 2416
+ − 2417 ### 3 Context Output
+ − 2418 else{
+ − 2419 ### calculating methylated CHG percentage if applicable
+ − 2420 if ($percent_meCHG){
+ − 2421 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ − 2422 }
+ − 2423 else{
+ − 2424 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ − 2425 }
+ − 2426
+ − 2427 ### calculating methylated CHH percentage if applicable
+ − 2428 if ($percent_meCHH){
+ − 2429 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ − 2430 }
+ − 2431 else{
+ − 2432 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ − 2433 }
+ − 2434 }
+ − 2435 }
+ − 2436
+ − 2437 ### detailed information about Cs analysed for on-screen report
+ − 2438 print "Final Cytosine Methylation Report\n",'='x33,"\n";
+ − 2439
+ − 2440 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ − 2441 print "Total number of C's analysed:\t$total_number_of_C\n\n";
+ − 2442
+ − 2443 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ − 2444 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ − 2445 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ − 2446
+ − 2447 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ − 2448 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ − 2449 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+ − 2450
+ − 2451 ### printing methylated CpG percentage if applicable
+ − 2452 if ($percent_meCpG){
+ − 2453 print "C methylated in CpG context:\t${percent_meCpG}%\n";
+ − 2454 }
+ − 2455 else{
+ − 2456 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ − 2457 }
+ − 2458
+ − 2459 ### 2-Context Output
+ − 2460 if ($merge_non_CpG){
+ − 2461 if ($percent_non_CpG_methylation){
+ − 2462 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ − 2463 }
+ − 2464 else{
+ − 2465 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ − 2466 }
+ − 2467 }
+ − 2468
+ − 2469 ### 3-Context Output
+ − 2470 else{
+ − 2471 ### printing methylated CHG percentage if applicable
+ − 2472 if ($percent_meCHG){
+ − 2473 print "C methylated in CHG context:\t${percent_meCHG}%\n";
+ − 2474 }
+ − 2475 else{
+ − 2476 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ − 2477 }
+ − 2478
+ − 2479 ### printing methylated CHH percentage if applicable
+ − 2480 if ($percent_meCHH){
+ − 2481 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ − 2482 }
+ − 2483 else{
+ − 2484 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ − 2485 }
+ − 2486 }
+ − 2487 }
+ − 2488
+ − 2489
+ − 2490
+ − 2491 sub print_individual_C_methylation_states_paired_end_files{
+ − 2492
3
+ − 2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
+ − 2494
+ − 2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
+ − 2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
+ − 2497
0
+ − 2498 my @methylation_calls = split(//,$meth_call);
+ − 2499
+ − 2500 #################################################################
+ − 2501 ### . for bases not involving cytosines ###
+ − 2502 ### X for methylated C in CHG context (was protected) ###
+ − 2503 ### x for not methylated C in CHG context (was converted) ###
+ − 2504 ### H for methylated C in CHH context (was protected) ###
+ − 2505 ### h for not methylated C in CHH context (was converted) ###
+ − 2506 ### Z for methylated C in CpG context (was protected) ###
+ − 2507 ### z for not methylated C in CpG context (was converted) ###
3
+ − 2508 ### U for methylated C in Unknown context (was protected) ###
+ − 2509 ### u for not methylated C in Unknown context (was converted) ###
0
+ − 2510 #################################################################
+ − 2511
+ − 2512 my $methyl_CHG_count = 0;
+ − 2513 my $methyl_CHH_count = 0;
+ − 2514 my $methyl_CpG_count = 0;
+ − 2515 my $unmethylated_CHG_count = 0;
+ − 2516 my $unmethylated_CHH_count = 0;
+ − 2517 my $unmethylated_CpG_count = 0;
+ − 2518
+ − 2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ − 2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+ − 2521 my @comp_cigar;
+ − 2522
+ − 2523 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
+ − 2524 if ($cigar =~ /^\d+M$/){
3
+ − 2525 # this check speeds up the extraction process by up to 60%!!!
0
+ − 2526 }
+ − 2527 else{ # parsing CIGAR string
+ − 2528 my @len;
+ − 2529 my @ops;
+ − 2530 @len = split (/\D+/,$cigar); # storing the length per operation
+ − 2531 @ops = split (/\d+/,$cigar); # storing the operation
+ − 2532 shift @ops; # remove the empty first element
3
+ − 2533
0
+ − 2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+ − 2535
+ − 2536 foreach my $index (0..$#len){
+ − 2537 foreach (1..$len[$index]){
+ − 2538 # print "$ops[$index]";
+ − 2539 push @comp_cigar, $ops[$index];
+ − 2540 }
+ − 2541 }
+ − 2542 # warn "\nDetected CIGAR string: $cigar\n";
+ − 2543 # warn "Length of methylation call: ",length $meth_call,"\n";
+ − 2544 # warn "number of operations: ",scalar @ops,"\n";
+ − 2545 # warn "number of length digits: ",scalar @len,"\n\n";
+ − 2546 # print @comp_cigar,"\n";
+ − 2547 # print "$meth_call\n\n";
+ − 2548 # sleep (1);
+ − 2549 }
+ − 2550
+ − 2551 if ($strand eq '-') {
+ − 2552
+ − 2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
+ − 2554 if (@comp_cigar){
+ − 2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 2556 }
+ − 2557 # print "reverse CIGAR string: @comp_cigar\n";
+ − 2558
+ − 2559 ### the start position of paired-end files has already been corrected, see above
+ − 2560 }
+ − 2561
+ − 2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
+ − 2563
+ − 2564 if ($merge_non_CpG) {
3
+ − 2565 if ($no_overlap) { # this has to be read 2...
0
+ − 2566
+ − 2567 ### single-file CpG and non-CpG context output
+ − 2568 if ($full) {
+ − 2569 if ($strand eq '+') {
+ − 2570 for my $index (0..$#methylation_calls) {
+ − 2571
+ − 2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2575 $cigar_offset += $cigar_mod;
+ − 2576 $pos_offset += $pos_mod;
+ − 2577 }
+ − 2578
+ − 2579 ### Returning as soon as the methylation calls start overlapping
+ − 2580 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 2581 return;
+ − 2582 }
+ − 2583
+ − 2584 if ($methylation_calls[$index] eq 'X') {
+ − 2585 $counting{total_meCHG_count}++;
3
+ − 2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2587 if ($read_identity == 1){
+ − 2588 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 2589 }
+ − 2590 else{
+ − 2591 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 2592 }
+ − 2593 }
+ − 2594 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 2595 $counting{total_unmethylated_CHG_count}++;
3
+ − 2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2597 if ($read_identity == 1){
+ − 2598 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 2599 }
+ − 2600 else{
+ − 2601 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 2602 }
+ − 2603 }
+ − 2604 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 2605 $counting{total_meCpG_count}++;
3
+ − 2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2607 if ($read_identity == 1){
+ − 2608 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 2609 }
+ − 2610 else{
+ − 2611 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 2612 }
+ − 2613 }
+ − 2614 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 2615 $counting{total_unmethylated_CpG_count}++;
3
+ − 2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2617 if ($read_identity == 1){
+ − 2618 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 2619 }
+ − 2620 else{
+ − 2621 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 2622 }
+ − 2623 }
+ − 2624 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 2625 $counting{total_meCHH_count}++;
3
+ − 2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2627 if ($read_identity == 1){
+ − 2628 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 2629 }
+ − 2630 else{
+ − 2631 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 2632 }
+ − 2633 }
+ − 2634 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 2635 $counting{total_unmethylated_CHH_count}++;
3
+ − 2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2637 if ($read_identity == 1){
+ − 2638 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 2639 }
+ − 2640 else{
+ − 2641 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 2642 }
0
+ − 2643 }
+ − 2644 elsif ($methylation_calls[$index] eq '.'){}
3
+ − 2645 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 2646 else{
3
+ − 2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
0
+ − 2648 }
+ − 2649 }
3
+ − 2650 }
+ − 2651 elsif ($strand eq '-') {
0
+ − 2652 for my $index (0..$#methylation_calls) {
+ − 2653
+ − 2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2657 $cigar_offset += $cigar_mod;
+ − 2658 $pos_offset += $pos_mod;
+ − 2659 }
+ − 2660
+ − 2661 ### Returning as soon as the methylation calls start overlapping
+ − 2662 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 2663 return;
+ − 2664 }
3
+ − 2665
0
+ − 2666 if ($methylation_calls[$index] eq 'X') {
+ − 2667 $counting{total_meCHG_count}++;
3
+ − 2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2669 if ($read_identity == 1){
+ − 2670 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 2671 }
+ − 2672 else{
+ − 2673 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 2674 }
+ − 2675 }
+ − 2676 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 2677 $counting{total_unmethylated_CHG_count}++;
3
+ − 2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2679 if ($read_identity == 1){
+ − 2680 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 2681 }
+ − 2682 else{
+ − 2683 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 2684 }
+ − 2685 }
+ − 2686 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 2687 $counting{total_meCpG_count}++;
3
+ − 2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2689 if ($read_identity == 1){
+ − 2690 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 2691 }
+ − 2692 else{
+ − 2693 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 2694 }
+ − 2695 }
+ − 2696 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 2697 $counting{total_unmethylated_CpG_count}++;
3
+ − 2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2699 if ($read_identity == 1){
+ − 2700 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 2701 }
+ − 2702 else{
+ − 2703 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 2704 }
+ − 2705 }
+ − 2706 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 2707 $counting{total_meCHH_count}++;
3
+ − 2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2709 if ($read_identity == 1){
+ − 2710 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 2711 }
+ − 2712 else{
+ − 2713 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 2714 }
+ − 2715 }
+ − 2716 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 2717 $counting{total_unmethylated_CHH_count}++;
3
+ − 2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2719 if ($read_identity == 1){
+ − 2720 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 2721 }
+ − 2722 else{
+ − 2723 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 2724 }
0
+ − 2725 }
+ − 2726 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 2727 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 2728 else{
3
+ − 2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
0
+ − 2730 }
+ − 2731 }
+ − 2732 } else {
+ − 2733 die "The read orientation was neither + nor -: '$strand'\n";
+ − 2734 }
+ − 2735 }
+ − 2736
+ − 2737 ### strand-specific methylation output
+ − 2738 else {
+ − 2739 if ($strand eq '+') {
+ − 2740 for my $index (0..$#methylation_calls) {
+ − 2741
+ − 2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2745 $cigar_offset += $cigar_mod;
+ − 2746 $pos_offset += $pos_mod;
+ − 2747 }
+ − 2748
+ − 2749 ### Returning as soon as the methylation calls start overlapping
+ − 2750 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 2751 return;
+ − 2752 }
+ − 2753
+ − 2754 if ($methylation_calls[$index] eq 'X') {
+ − 2755 $counting{total_meCHG_count}++;
3
+ − 2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2757 if ($read_identity == 1){
+ − 2758 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 2759 }
+ − 2760 else{
+ − 2761 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 2762 }
+ − 2763 }
+ − 2764 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 2765 $counting{total_unmethylated_CHG_count}++;
3
+ − 2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2767 if ($read_identity == 1){
+ − 2768 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 2769 }
+ − 2770 else{
+ − 2771 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 2772 }
+ − 2773 }
+ − 2774 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 2775 $counting{total_meCpG_count}++;
3
+ − 2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2777 if ($read_identity == 1){
+ − 2778 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 2779 }
+ − 2780 else{
+ − 2781 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 2782 }
+ − 2783 }
+ − 2784 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 2785 $counting{total_unmethylated_CpG_count}++;
3
+ − 2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2787 if ($read_identity == 1){
+ − 2788 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 2789 }
+ − 2790 else{
+ − 2791 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 2792 }
+ − 2793 }
+ − 2794 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 2795 $counting{total_meCHH_count}++;
3
+ − 2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2797 if ($read_identity == 1){
+ − 2798 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 2799 }
+ − 2800 else{
+ − 2801 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 2802 }
+ − 2803 }
+ − 2804 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 2805 $counting{total_unmethylated_CHH_count}++;
3
+ − 2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2807 if ($read_identity == 1){
+ − 2808 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 2809 }
+ − 2810 else{
+ − 2811 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 2812 }
0
+ − 2813 }
+ − 2814 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 2815 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 2816 else{
+ − 2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2818 }
+ − 2819 }
+ − 2820 } elsif ($strand eq '-') {
+ − 2821 for my $index (0..$#methylation_calls) {
+ − 2822
+ − 2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2826 $cigar_offset += $cigar_mod;
+ − 2827 $pos_offset += $pos_mod;
+ − 2828 }
+ − 2829
+ − 2830 ### Returning as soon as the methylation calls start overlapping
+ − 2831 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 2832 return;
+ − 2833 }
+ − 2834
+ − 2835 if ($methylation_calls[$index] eq 'X') {
+ − 2836 $counting{total_meCHG_count}++;
3
+ − 2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2838 if ($read_identity == 1){
+ − 2839 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 2840 }
+ − 2841 else{
+ − 2842 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 2843 }
+ − 2844 }
+ − 2845 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 2846 $counting{total_unmethylated_CHG_count}++;
3
+ − 2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2848 if ($read_identity == 1){
+ − 2849 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 2850 }
+ − 2851 else{
+ − 2852 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 2853 }
+ − 2854 }
+ − 2855 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 2856 $counting{total_meCpG_count}++;
3
+ − 2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2858 if ($read_identity == 1){
+ − 2859 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 2860 }
+ − 2861 else{
+ − 2862 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 2863 }
+ − 2864 }
+ − 2865 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 2866 $counting{total_unmethylated_CpG_count}++;
3
+ − 2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2868 if ($read_identity == 1){
+ − 2869 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 2870 }
+ − 2871 else{
+ − 2872 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 2873 }
+ − 2874 }
+ − 2875 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 2876 $counting{total_meCHH_count}++;
3
+ − 2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2878 if ($read_identity == 1){
+ − 2879 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 2880 }
+ − 2881 else{
+ − 2882 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 2883 }
+ − 2884 }
+ − 2885 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 2886 $counting{total_unmethylated_CHH_count}++;
3
+ − 2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2888 if ($read_identity == 1){
+ − 2889 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 2890 }
+ − 2891 else{
+ − 2892 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 2893 }
0
+ − 2894 }
+ − 2895 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 2896 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 2897 else{
+ − 2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2899 }
+ − 2900 }
+ − 2901 } else {
+ − 2902 die "The strand orientation was neither + nor -: '$strand'/n";
+ − 2903 }
+ − 2904 }
+ − 2905 }
+ − 2906
+ − 2907 ### this is the default paired-end procedure allowing overlaps and using every single C position
+ − 2908 ### Still within the 2-CONTEXT ONLY optional section
+ − 2909 else {
+ − 2910 ### single-file CpG and non-CpG context output
+ − 2911 if ($full) {
+ − 2912 if ($strand eq '+') {
+ − 2913 for my $index (0..$#methylation_calls) {
+ − 2914
+ − 2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2918 $cigar_offset += $cigar_mod;
+ − 2919 $pos_offset += $pos_mod;
+ − 2920 }
+ − 2921
+ − 2922 if ($methylation_calls[$index] eq 'X') {
+ − 2923 $counting{total_meCHG_count}++;
3
+ − 2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2925 if ($read_identity == 1){
+ − 2926 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 2927 }
+ − 2928 else{
+ − 2929 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 2930 }
+ − 2931 }
+ − 2932 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 2933 $counting{total_unmethylated_CHG_count}++;
3
+ − 2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2935 if ($read_identity == 1){
+ − 2936 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 2937 }
+ − 2938 else{
+ − 2939 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 2940 }
+ − 2941 }
+ − 2942 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 2943 $counting{total_meCpG_count}++;
3
+ − 2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2945 if ($read_identity == 1){
+ − 2946 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 2947 }
+ − 2948 else{
+ − 2949 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 2950 }
+ − 2951 }
+ − 2952 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 2953 $counting{total_unmethylated_CpG_count}++;
3
+ − 2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2955 if ($read_identity == 1){
+ − 2956 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 2957 }
+ − 2958 else{
+ − 2959 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 2960 }
+ − 2961 }
+ − 2962 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 2963 $counting{total_meCHH_count}++;
3
+ − 2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2965 if ($read_identity == 1){
+ − 2966 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 2967 }
+ − 2968 else{
+ − 2969 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 2970 }
+ − 2971 }
+ − 2972 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 2973 $counting{total_unmethylated_CHH_count}++;
3
+ − 2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 2975 if ($read_identity == 1){
+ − 2976 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 2977 }
+ − 2978 else{
+ − 2979 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 2980 }
0
+ − 2981 }
+ − 2982 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 2983 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 2984 else{
3
+ − 2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
0
+ − 2986 }
+ − 2987 }
+ − 2988 } elsif ($strand eq '-') {
+ − 2989 for my $index (0..$#methylation_calls) {
+ − 2990
+ − 2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2994 $cigar_offset += $cigar_mod;
+ − 2995 $pos_offset += $pos_mod;
+ − 2996 }
+ − 2997
+ − 2998 if ($methylation_calls[$index] eq 'X') {
+ − 2999 $counting{total_meCHG_count}++;
3
+ − 3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3001 if ($read_identity == 1){
+ − 3002 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3003 }
+ − 3004 else{
+ − 3005 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3006 }
+ − 3007 }
+ − 3008 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3009 $counting{total_unmethylated_CHG_count}++;
3
+ − 3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3011 if ($read_identity == 1){
+ − 3012 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3013 }
+ − 3014 else{
+ − 3015 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3016 }
+ − 3017 }
+ − 3018 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3019 $counting{total_meCpG_count}++;
3
+ − 3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3021 if ($read_identity == 1){
+ − 3022 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3023 }
+ − 3024 else{
+ − 3025 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3026 }
+ − 3027 }
+ − 3028 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3029 $counting{total_unmethylated_CpG_count}++;
3
+ − 3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3031 if ($read_identity == 1){
+ − 3032 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3033 }
+ − 3034 else{
+ − 3035 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3036 }
+ − 3037 }
+ − 3038 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3039 $counting{total_meCHH_count}++;
3
+ − 3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3041 if ($read_identity == 1){
+ − 3042 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3043 }
+ − 3044 else{
+ − 3045 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3046 }
+ − 3047 }
+ − 3048 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3049 $counting{total_unmethylated_CHH_count}++;
3
+ − 3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3051 if ($read_identity == 1){
+ − 3052 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3053 }
+ − 3054 else{
+ − 3055 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3056 }
0
+ − 3057 }
+ − 3058 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3059 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3060 else{
3
+ − 3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
0
+ − 3062 }
+ − 3063 }
+ − 3064 } else {
+ − 3065 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3066 }
+ − 3067 }
+ − 3068
+ − 3069 ### strand-specific methylation output
+ − 3070 ### still within the 2-CONTEXT optional section
+ − 3071 else {
+ − 3072 if ($strand eq '+') {
+ − 3073 for my $index (0..$#methylation_calls) {
+ − 3074
+ − 3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 3078 $cigar_offset += $cigar_mod;
+ − 3079 $pos_offset += $pos_mod;
+ − 3080 }
+ − 3081
+ − 3082 if ($methylation_calls[$index] eq 'X') {
+ − 3083 $counting{total_meCHG_count}++;
3
+ − 3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3085 if ($read_identity == 1){
+ − 3086 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3087 }
+ − 3088 else{
+ − 3089 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3090 }
+ − 3091 }
+ − 3092 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3093 $counting{total_unmethylated_CHG_count}++;
3
+ − 3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3095 if ($read_identity == 1){
+ − 3096 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3097 }
+ − 3098 else{
+ − 3099 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3100 }
+ − 3101 }
+ − 3102 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3103 $counting{total_meCpG_count}++;
3
+ − 3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3105 if ($read_identity == 1){
+ − 3106 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3107 }
+ − 3108 else{
+ − 3109 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3110 }
+ − 3111 }
+ − 3112 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3113 $counting{total_unmethylated_CpG_count}++;
3
+ − 3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3115 if ($read_identity == 1){
+ − 3116 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3117 }
+ − 3118 else{
+ − 3119 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3120 }
+ − 3121 }
+ − 3122 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3123 $counting{total_meCHH_count}++;
3
+ − 3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3125 if ($read_identity == 1){
+ − 3126 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3127 }
+ − 3128 else{
+ − 3129 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3130 }
+ − 3131 }
+ − 3132 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3133 $counting{total_unmethylated_CHH_count}++;
3
+ − 3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3135 if ($read_identity == 1){
+ − 3136 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3137 }
+ − 3138 else{
+ − 3139 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3140 }
0
+ − 3141 }
+ − 3142 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3143 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3144 else{
+ − 3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3146 }
+ − 3147 }
+ − 3148 } elsif ($strand eq '-') {
+ − 3149 for my $index (0..$#methylation_calls) {
+ − 3150
+ − 3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3154 $cigar_offset += $cigar_mod;
+ − 3155 $pos_offset += $pos_mod;
+ − 3156 }
+ − 3157
+ − 3158 if ($methylation_calls[$index] eq 'X') {
+ − 3159 $counting{total_meCHG_count}++;
3
+ − 3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3161 if ($read_identity == 1){
+ − 3162 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3163 }
+ − 3164 else{
+ − 3165 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3166 }
+ − 3167 }
+ − 3168 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3169 $counting{total_unmethylated_CHG_count}++;
3
+ − 3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3171 if ($read_identity == 1){
+ − 3172 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3173 }
+ − 3174 else{
+ − 3175 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3176 }
+ − 3177 }
+ − 3178 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3179 $counting{total_meCpG_count}++;
3
+ − 3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3181 if ($read_identity == 1){
+ − 3182 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3183 }
+ − 3184 else{
+ − 3185 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3186 }
+ − 3187 }
+ − 3188 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3189 $counting{total_unmethylated_CpG_count}++;
3
+ − 3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3191 if ($read_identity == 1){
+ − 3192 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3193 }
+ − 3194 else{
+ − 3195 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3196 }
+ − 3197 }
+ − 3198 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3199 $counting{total_meCHH_count}++;
3
+ − 3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3201 if ($read_identity == 1){
+ − 3202 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3203 }
+ − 3204 else{
+ − 3205 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3206 }
+ − 3207 }
+ − 3208 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3209 $counting{total_unmethylated_CHH_count}++;
3
+ − 3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3211 if ($read_identity == 1){
+ − 3212 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3213 }
+ − 3214 else{
+ − 3215 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3216 }
0
+ − 3217 }
+ − 3218 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3219 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3220 else{
+ − 3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3222 }
+ − 3223 }
+ − 3224 } else {
+ − 3225 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3226 }
+ − 3227 }
+ − 3228 }
+ − 3229 }
+ − 3230
+ − 3231 ############################################
+ − 3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
+ − 3233 ############################################
+ − 3234
+ − 3235 elsif ($no_overlap) {
+ − 3236 ### single-file CpG, CHG and CHH context output
+ − 3237 if ($full) {
+ − 3238 if ($strand eq '+') {
+ − 3239 for my $index (0..$#methylation_calls) {
+ − 3240
+ − 3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 3244 $cigar_offset += $cigar_mod;
+ − 3245 $pos_offset += $pos_mod;
+ − 3246 }
+ − 3247
+ − 3248 ### Returning as soon as the methylation calls start overlapping
+ − 3249 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 3250 return;
+ − 3251 }
+ − 3252
+ − 3253 if ($methylation_calls[$index] eq 'X') {
+ − 3254 $counting{total_meCHG_count}++;
3
+ − 3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3256 if ($read_identity == 1){
+ − 3257 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3258 }
+ − 3259 else{
+ − 3260 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3261 }
+ − 3262 }
+ − 3263 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3264 $counting{total_unmethylated_CHG_count}++;
3
+ − 3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3266 if ($read_identity == 1){
+ − 3267 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3268 }
+ − 3269 else{
+ − 3270 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3271 }
+ − 3272 }
+ − 3273 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3274 $counting{total_meCpG_count}++;
3
+ − 3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3276 if ($read_identity == 1){
+ − 3277 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3278 }
+ − 3279 else{
+ − 3280 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3281 }
+ − 3282 }
+ − 3283 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3284 $counting{total_unmethylated_CpG_count}++;
3
+ − 3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3286 if ($read_identity == 1){
+ − 3287 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3288 }
+ − 3289 else{
+ − 3290 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3291 }
+ − 3292 }
+ − 3293 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3294 $counting{total_meCHH_count}++;
3
+ − 3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3296 if ($read_identity == 1){
+ − 3297 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3298 }
+ − 3299 else{
+ − 3300 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3301 }
+ − 3302 }
+ − 3303 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3304 $counting{total_unmethylated_CHH_count}++;
3
+ − 3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3306 if ($read_identity == 1){
+ − 3307 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3308 }
+ − 3309 else{
+ − 3310 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3311 }
0
+ − 3312 }
+ − 3313 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3314 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3315 else{
+ − 3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3317 }
+ − 3318 }
+ − 3319 } elsif ($strand eq '-') {
+ − 3320 for my $index (0..$#methylation_calls) {
+ − 3321
+ − 3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3325 $cigar_offset += $cigar_mod;
+ − 3326 $pos_offset += $pos_mod;
+ − 3327 }
+ − 3328
+ − 3329 ### Returning as soon as the methylation calls start overlapping
+ − 3330 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 3331 return;
+ − 3332 }
+ − 3333
+ − 3334 if ($methylation_calls[$index] eq 'X') {
+ − 3335 $counting{total_meCHG_count}++;
3
+ − 3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3337 if ($read_identity == 1){
+ − 3338 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3339 }
+ − 3340 else{
+ − 3341 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3342 }
+ − 3343 }
+ − 3344 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3345 $counting{total_unmethylated_CHG_count}++;
3
+ − 3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3347 if ($read_identity == 1){
+ − 3348 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3349 }
+ − 3350 else{
+ − 3351 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3352 }
+ − 3353 }
+ − 3354 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3355 $counting{total_meCpG_count}++;
3
+ − 3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3357 if ($read_identity == 1){
+ − 3358 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3359 }
+ − 3360 else{
+ − 3361 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3362 }
+ − 3363 }
+ − 3364 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3365 $counting{total_unmethylated_CpG_count}++;
3
+ − 3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3367 if ($read_identity == 1){
+ − 3368 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3369 }
+ − 3370 else{
+ − 3371 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3372 }
+ − 3373 }
+ − 3374 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3375 $counting{total_meCHH_count}++;
3
+ − 3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3377 if ($read_identity == 1){
+ − 3378 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3379 }
+ − 3380 else{
+ − 3381 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3382 }
+ − 3383 }
+ − 3384 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3385 $counting{total_unmethylated_CHH_count}++;
3
+ − 3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3387 if ($read_identity == 1){
+ − 3388 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3389 }
+ − 3390 else{
+ − 3391 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3392 }
0
+ − 3393 }
+ − 3394 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3395 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3396 else{
+ − 3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3398 }
+ − 3399 }
+ − 3400 } else {
+ − 3401 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3402 }
+ − 3403 }
+ − 3404
+ − 3405 ### strand-specific methylation output
+ − 3406 else {
+ − 3407 if ($strand eq '+') {
+ − 3408 for my $index (0..$#methylation_calls) {
+ − 3409
+ − 3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 3413 $cigar_offset += $cigar_mod;
+ − 3414 $pos_offset += $pos_mod;
+ − 3415 }
+ − 3416
+ − 3417 ### Returning as soon as the methylation calls start overlapping
+ − 3418 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 3419 return;
+ − 3420 }
+ − 3421
+ − 3422 if ($methylation_calls[$index] eq 'X') {
+ − 3423 $counting{total_meCHG_count}++;
3
+ − 3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3425 if ($read_identity == 1){
+ − 3426 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3427 }
+ − 3428 else{
+ − 3429 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3430 }
+ − 3431 }
+ − 3432 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3433 $counting{total_unmethylated_CHG_count}++;
3
+ − 3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3435 if ($read_identity == 1){
+ − 3436 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3437 }
+ − 3438 else{
+ − 3439 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3440 }
+ − 3441 }
+ − 3442 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3443 $counting{total_meCpG_count}++;
3
+ − 3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3445 if ($read_identity == 1){
+ − 3446 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3447 }
+ − 3448 else{
+ − 3449 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3450 }
+ − 3451 }
+ − 3452 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3453 $counting{total_unmethylated_CpG_count}++;
3
+ − 3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3455 if ($read_identity == 1){
+ − 3456 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3457 }
+ − 3458 else{
+ − 3459 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3460 }
+ − 3461 }
+ − 3462 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3463 $counting{total_meCHH_count}++;
3
+ − 3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3465 if ($read_identity == 1){
+ − 3466 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3467 }
+ − 3468 else{
+ − 3469 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3470 }
+ − 3471 }
+ − 3472 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3473 $counting{total_unmethylated_CHH_count}++;
3
+ − 3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3475 if ($read_identity == 1){
+ − 3476 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3477 }
+ − 3478 else{
+ − 3479 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3480 }
0
+ − 3481 }
+ − 3482 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3483 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3484 else{
+ − 3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3486 }
+ − 3487 }
+ − 3488 } elsif ($strand eq '-') {
+ − 3489 for my $index (0..$#methylation_calls) {
+ − 3490
+ − 3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3494 $cigar_offset += $cigar_mod;
+ − 3495 $pos_offset += $pos_mod;
+ − 3496 }
+ − 3497
+ − 3498 ### Returning as soon as the methylation calls start overlapping
+ − 3499 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 3500 return;
+ − 3501 }
+ − 3502
+ − 3503 if ($methylation_calls[$index] eq 'X') {
+ − 3504 $counting{total_meCHG_count}++;
3
+ − 3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3506 if ($read_identity == 1){
+ − 3507 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3508 }
+ − 3509 else{
+ − 3510 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3511 }
+ − 3512 }
+ − 3513 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3514 $counting{total_unmethylated_CHG_count}++;
3
+ − 3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3516 if ($read_identity == 1){
+ − 3517 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3518 }
+ − 3519 else{
+ − 3520 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3521 }
+ − 3522 }
+ − 3523 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3524 $counting{total_meCpG_count}++;
3
+ − 3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3526 if ($read_identity == 1){
+ − 3527 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3528 }
+ − 3529 else{
+ − 3530 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3531 }
+ − 3532 }
+ − 3533 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3534 $counting{total_unmethylated_CpG_count}++;
3
+ − 3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3536 if ($read_identity == 1){
+ − 3537 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3538 }
+ − 3539 else{
+ − 3540 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3541 }
+ − 3542 }
+ − 3543 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3544 $counting{total_meCHH_count}++;
3
+ − 3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3546 if ($read_identity == 1){
+ − 3547 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3548 }
+ − 3549 else{
+ − 3550 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3551 }
+ − 3552 }
+ − 3553 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3554 $counting{total_unmethylated_CHH_count}++;
3
+ − 3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3556 if ($read_identity == 1){
+ − 3557 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3558 }
+ − 3559 else{
+ − 3560 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3561 }
0
+ − 3562 }
+ − 3563 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3564 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3565 else{
+ − 3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3567 }
+ − 3568 }
+ − 3569 } else {
+ − 3570 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3571 }
+ − 3572 }
+ − 3573 }
+ − 3574
+ − 3575 ### this is the default paired-end procedure allowing overlaps and using every single C position
+ − 3576 else {
+ − 3577 ### single-file CpG, CHG and CHH context output
+ − 3578 if ($full) {
+ − 3579 if ($strand eq '+') {
+ − 3580 for my $index (0..$#methylation_calls) {
+ − 3581
+ − 3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 3585 $cigar_offset += $cigar_mod;
+ − 3586 $pos_offset += $pos_mod;
+ − 3587 }
+ − 3588
+ − 3589 if ($methylation_calls[$index] eq 'X') {
+ − 3590 $counting{total_meCHG_count}++;
3
+ − 3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3592 if ($read_identity == 1){
+ − 3593 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3594 }
+ − 3595 else{
+ − 3596 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3597 }
+ − 3598 }
+ − 3599 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3600 $counting{total_unmethylated_CHG_count}++;
3
+ − 3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3602 if ($read_identity == 1){
+ − 3603 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3604 }
+ − 3605 else{
+ − 3606 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3607 }
+ − 3608 }
+ − 3609 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3610 $counting{total_meCpG_count}++;
3
+ − 3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3612 if ($read_identity == 1){
+ − 3613 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3614 }
+ − 3615 else{
+ − 3616 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3617 }
+ − 3618 }
+ − 3619 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3620 $counting{total_unmethylated_CpG_count}++;
3
+ − 3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3622 if ($read_identity == 1){
+ − 3623 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3624 }
+ − 3625 else{
+ − 3626 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3627 }
+ − 3628 }
+ − 3629 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3630 $counting{total_meCHH_count}++;
3
+ − 3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3632 if ($read_identity == 1){
+ − 3633 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3634 }
+ − 3635 else{
+ − 3636 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3637 }
+ − 3638 }
+ − 3639 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3640 $counting{total_unmethylated_CHH_count}++;
3
+ − 3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3642 if ($read_identity == 1){
+ − 3643 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3644 }
+ − 3645 else{
+ − 3646 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3647 }
0
+ − 3648 }
+ − 3649 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3650 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3651 else{
+ − 3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3653 }
+ − 3654 }
+ − 3655 } elsif ($strand eq '-') {
+ − 3656 for my $index (0..$#methylation_calls) {
+ − 3657
+ − 3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3661 $cigar_offset += $cigar_mod;
+ − 3662 $pos_offset += $pos_mod;
+ − 3663 }
+ − 3664
+ − 3665 if ($methylation_calls[$index] eq 'X') {
+ − 3666 $counting{total_meCHG_count}++;
3
+ − 3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3668 if ($read_identity == 1){
+ − 3669 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3670 }
+ − 3671 else{
+ − 3672 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3673 }
+ − 3674 }
+ − 3675 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3676 $counting{total_unmethylated_CHG_count}++;
3
+ − 3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3678 if ($read_identity == 1){
+ − 3679 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3680 }
+ − 3681 else{
+ − 3682 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3683 }
+ − 3684 }
+ − 3685 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3686 $counting{total_meCpG_count}++;
3
+ − 3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3688 if ($read_identity == 1){
+ − 3689 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3690 }
+ − 3691 else{
+ − 3692 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3693 }
+ − 3694 }
+ − 3695 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3696 $counting{total_unmethylated_CpG_count}++;
3
+ − 3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3698 if ($read_identity == 1){
+ − 3699 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3700 }
+ − 3701 else{
+ − 3702 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3703 }
+ − 3704 }
+ − 3705 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3706 $counting{total_meCHH_count}++;
3
+ − 3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3708 if ($read_identity == 1){
+ − 3709 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3710 }
+ − 3711 else{
+ − 3712 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3713 }
+ − 3714 }
+ − 3715 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3716 $counting{total_unmethylated_CHH_count}++;
3
+ − 3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3718 if ($read_identity == 1){
+ − 3719 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3720 }
+ − 3721 else{
+ − 3722 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3723 }
0
+ − 3724 }
+ − 3725 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3726 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3727 else{
+ − 3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3729 }
+ − 3730 }
+ − 3731 } else {
+ − 3732 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3733 }
+ − 3734 }
+ − 3735
+ − 3736 ### strand-specific methylation output
+ − 3737 else {
+ − 3738 if ($strand eq '+') {
+ − 3739 for my $index (0..$#methylation_calls) {
+ − 3740
+ − 3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 3744 $cigar_offset += $cigar_mod;
+ − 3745 $pos_offset += $pos_mod;
+ − 3746 }
+ − 3747
+ − 3748 if ($methylation_calls[$index] eq 'X') {
+ − 3749 $counting{total_meCHG_count}++;
3
+ − 3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3751 if ($read_identity == 1){
+ − 3752 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3753 }
+ − 3754 else{
+ − 3755 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3756 }
+ − 3757 }
+ − 3758 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3759 $counting{total_unmethylated_CHG_count}++;
3
+ − 3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3761 if ($read_identity == 1){
+ − 3762 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3763 }
+ − 3764 else{
+ − 3765 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3766 }
+ − 3767 }
+ − 3768 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3769 $counting{total_meCpG_count}++;
3
+ − 3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3771 if ($read_identity == 1){
+ − 3772 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3773 }
+ − 3774 else{
+ − 3775 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3776 }
+ − 3777 }
+ − 3778 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3779 $counting{total_unmethylated_CpG_count}++;
3
+ − 3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3781 if ($read_identity == 1){
+ − 3782 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3783 }
+ − 3784 else{
+ − 3785 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3786 }
+ − 3787 }
+ − 3788 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3789 $counting{total_meCHH_count}++;
3
+ − 3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3791 if ($read_identity == 1){
+ − 3792 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3793 }
+ − 3794 else{
+ − 3795 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3796 }
+ − 3797 }
+ − 3798 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3799 $counting{total_unmethylated_CHH_count}++;
3
+ − 3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3801 if ($read_identity == 1){
+ − 3802 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3803 }
+ − 3804 else{
+ − 3805 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3806 }
0
+ − 3807 }
+ − 3808 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3809 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3810 else{
+ − 3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3812 }
+ − 3813 }
+ − 3814 } elsif ($strand eq '-') {
+ − 3815 for my $index (0..$#methylation_calls) {
+ − 3816
+ − 3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
+ − 3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 3820 $cigar_offset += $cigar_mod;
+ − 3821 $pos_offset += $pos_mod;
+ − 3822 }
+ − 3823
+ − 3824 if ($methylation_calls[$index] eq 'X') {
+ − 3825 $counting{total_meCHG_count}++;
3
+ − 3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3827 if ($read_identity == 1){
+ − 3828 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 3829 }
+ − 3830 else{
+ − 3831 $mbias_2{CHG}->{$index+1}->{meth}++;
+ − 3832 }
+ − 3833 }
+ − 3834 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 3835 $counting{total_unmethylated_CHG_count}++;
3
+ − 3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3837 if ($read_identity == 1){
+ − 3838 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 3839 }
+ − 3840 else{
+ − 3841 $mbias_2{CHG}->{$index+1}->{un}++;
+ − 3842 }
+ − 3843 }
+ − 3844 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 3845 $counting{total_meCpG_count}++;
3
+ − 3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3847 if ($read_identity == 1){
+ − 3848 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 3849 }
+ − 3850 else{
+ − 3851 $mbias_2{CpG}->{$index+1}->{meth}++;
+ − 3852 }
+ − 3853 }
+ − 3854 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 3855 $counting{total_unmethylated_CpG_count}++;
3
+ − 3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3857 if ($read_identity == 1){
+ − 3858 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 3859 }
+ − 3860 else{
+ − 3861 $mbias_2{CpG}->{$index+1}->{un}++;
+ − 3862 }
+ − 3863 }
+ − 3864 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 3865 $counting{total_meCHH_count}++;
3
+ − 3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3867 if ($read_identity == 1){
+ − 3868 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 3869 }
+ − 3870 else{
+ − 3871 $mbias_2{CHH}->{$index+1}->{meth}++;
+ − 3872 }
+ − 3873 }
+ − 3874 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 3875 $counting{total_unmethylated_CHH_count}++;
3
+ − 3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 3877 if ($read_identity == 1){
+ − 3878 $mbias_1{CHH}->{$index+1}->{un}++;
+ − 3879 }
+ − 3880 else{
+ − 3881 $mbias_2{CHH}->{$index+1}->{un}++;
+ − 3882 }
0
+ − 3883 }
+ − 3884 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 3885 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 3886 else{
+ − 3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 3888 }
+ − 3889 }
+ − 3890 } else {
+ − 3891 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 3892 }
+ − 3893 }
+ − 3894 }
+ − 3895 }
+ − 3896
+ − 3897 sub check_cigar_string {
+ − 3898 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
+ − 3899 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
+ − 3900 my ($new_cigar_offset,$new_pos_offset) = (0,0);
+ − 3901
+ − 3902 if ($strand eq '+') {
+ − 3903 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+ − 3904
+ − 3905 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 3906 # warn "position needs no adjustment\n";
+ − 3907 }
+ − 3908
+ − 3909 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ − 3910 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ − 3911 # warn "adjusted genomic position by -1 bp (insertion)\n";
+ − 3912 }
+ − 3913
+ − 3914 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 3915 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 3916 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ − 3917 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+ − 3918
+ − 3919 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ − 3920 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 3921 # warn "position needs no adjustment\n";
+ − 3922 last;
+ − 3923 }
+ − 3924 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ − 3925 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ − 3926 # warn "adjusted genomic position by another -1 bp (insertion)\n";
+ − 3927 last;
+ − 3928 }
+ − 3929 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 3930 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 3931 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ − 3932 # warn "adjusted genomic position by another +1 bp (deletion)\n";
+ − 3933 }
+ − 3934 else{
+ − 3935 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 3936 }
+ − 3937 }
+ − 3938 }
+ − 3939 else{
+ − 3940 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 3941 }
+ − 3942 }
+ − 3943
+ − 3944 elsif ($strand eq '-') {
+ − 3945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+ − 3946
+ − 3947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 3948 # warn "position needs no adjustment\n";
+ − 3949 }
+ − 3950
+ − 3951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ − 3952 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
+ − 3953 # warn "adjusted genomic position by +1 bp (insertion)\n";
+ − 3954 }
+ − 3955
+ − 3956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 3957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 3958 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ − 3959 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+ − 3960
+ − 3961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ − 3962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 3963 # warn "Found new 'M' operation; position needs no adjustment\n";
+ − 3964 last;
+ − 3965 }
+ − 3966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ − 3967 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
+ − 3968 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
+ − 3969 last;
+ − 3970 }
+ − 3971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 3972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 3973 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ − 3974 # warn "adjusted genomic position by another -1 bp (deletion)\n";
+ − 3975 }
+ − 3976 else{
+ − 3977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 3978 }
+ − 3979 }
+ − 3980 }
+ − 3981 else{
+ − 3982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 3983 }
+ − 3984 }
+ − 3985 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
+ − 3986 return ($new_cigar_offset,$new_pos_offset);
+ − 3987 }
+ − 3988
+ − 3989 sub print_individual_C_methylation_states_single_end{
+ − 3990
+ − 3991 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
+ − 3992 my @methylation_calls = split(//,$meth_call);
+ − 3993
+ − 3994 #################################################################
+ − 3995 ### . for bases not involving cytosines ###
+ − 3996 ### X for methylated C in CHG context (was protected) ###
+ − 3997 ### x for not methylated C in CHG context (was converted) ###
+ − 3998 ### H for methylated C in CHH context (was protected) ###
+ − 3999 ### h for not methylated C in CHH context (was converted) ###
+ − 4000 ### Z for methylated C in CpG context (was protected) ###
+ − 4001 ### z for not methylated C in CpG context (was converted) ###
+ − 4002 #################################################################
+ − 4003
+ − 4004 my $methyl_CHG_count = 0;
+ − 4005 my $methyl_CHH_count = 0;
+ − 4006 my $methyl_CpG_count = 0;
+ − 4007 my $unmethylated_CHG_count = 0;
+ − 4008 my $unmethylated_CHH_count = 0;
+ − 4009 my $unmethylated_CpG_count = 0;
+ − 4010
+ − 4011 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ − 4012 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+ − 4013
+ − 4014 my @comp_cigar;
+ − 4015
+ − 4016 if ($cigar){ # parsing CIGAR string
+ − 4017
+ − 4018 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
+ − 4019 if ($cigar =~ /^\d+M$/){
+ − 4020 # warn "See!? I told you so! $cigar\n";
+ − 4021 # sleep(1);
+ − 4022 }
+ − 4023 else{
+ − 4024
+ − 4025 my @len;
+ − 4026 my @ops;
+ − 4027
+ − 4028 @len = split (/\D+/,$cigar); # storing the length per operation
+ − 4029 @ops = split (/\d+/,$cigar); # storing the operation
+ − 4030 shift @ops; # remove the empty first element
3
+ − 4031 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
0
+ − 4032 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
3
+ − 4033
0
+ − 4034 foreach my $index (0..$#len){
+ − 4035 foreach (1..$len[$index]){
+ − 4036 # print "$ops[$index]";
+ − 4037 push @comp_cigar, $ops[$index];
+ − 4038 }
+ − 4039 }
+ − 4040 }
+ − 4041 # warn "\nDetected CIGAR string: $cigar\n";
+ − 4042 # warn "Length of methylation call: ",length $meth_call,"\n";
+ − 4043 # warn "number of operations: ",scalar @ops,"\n";
+ − 4044 # warn "number of length digits: ",scalar @len,"\n\n";
+ − 4045 # print @comp_cigar,"\n";
+ − 4046 # print "$meth_call\n\n";
+ − 4047 # sleep (1);
+ − 4048 }
3
+ − 4049
0
+ − 4050 ### adjusting the start position for all reads mapping to the reverse strand
+ − 4051 if ($strand eq '-') {
+ − 4052
+ − 4053 if (@comp_cigar){ # only needed for SAM reads with InDels
+ − 4054 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 4055 # print @comp_cigar,"\n";
+ − 4056 }
+ − 4057
+ − 4058 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
3
+ − 4059
0
+ − 4060 if ($cigar){ ### SAM format
+ − 4061 if ($cigar =~ /^(\d+)M$/){ # linear match
+ − 4062 $start += $1 - 1;
+ − 4063 }
+ − 4064 else{ # InDel read
+ − 4065 my $MD_count = 0;
+ − 4066 foreach (@comp_cigar){
+ − 4067 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 4068 }
+ − 4069 $start += $MD_count - 1;
+ − 4070 }
+ − 4071 }
+ − 4072 else{ ### vanilla format
+ − 4073 $start += length($meth_call)-1;
+ − 4074 }
+ − 4075 }
+ − 4076 }
+ − 4077
+ − 4078 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
+ − 4079
+ − 4080 ### single-file CpG and other-context output
+ − 4081 if ($full and $merge_non_CpG) {
+ − 4082 if ($strand eq '+') {
+ − 4083 for my $index (0..$#methylation_calls) {
+ − 4084
+ − 4085 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
+ − 4086 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4087 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 4088 $cigar_offset += $cigar_mod;
+ − 4089 $pos_offset += $pos_mod;
+ − 4090 }
+ − 4091
+ − 4092 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4093 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4094 if ($methylation_calls[$index] eq 'X') {
+ − 4095 $counting{total_meCHG_count}++;
3
+ − 4096 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4097 $mbias_1{CHG}->{$index+1}->{meth}++;
0
+ − 4098 }
+ − 4099 elsif ($methylation_calls[$index] eq 'x') {
+ − 4100 $counting{total_unmethylated_CHG_count}++;
3
+ − 4101 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4102 $mbias_1{CHG}->{$index+1}->{un}++;
0
+ − 4103 }
+ − 4104 elsif ($methylation_calls[$index] eq 'Z') {
+ − 4105 $counting{total_meCpG_count}++;
3
+ − 4106 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4107 $mbias_1{CpG}->{$index+1}->{meth}++;
0
+ − 4108 }
+ − 4109 elsif ($methylation_calls[$index] eq 'z') {
+ − 4110 $counting{total_unmethylated_CpG_count}++;
3
+ − 4111 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4112 $mbias_1{CpG}->{$index+1}->{un}++;
0
+ − 4113 }
+ − 4114 elsif ($methylation_calls[$index] eq 'H') {
+ − 4115 $counting{total_meCHH_count}++;
3
+ − 4116 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4117 $mbias_1{CHH}->{$index+1}->{meth}++;
0
+ − 4118 }
+ − 4119 elsif ($methylation_calls[$index] eq 'h') {
+ − 4120 $counting{total_unmethylated_CHH_count}++;
3
+ − 4121 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4122 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4123 }
3
+ − 4124 elsif ($methylation_calls[$index] eq '.') {}
+ − 4125 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4126 else{
+ − 4127 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4128 }
+ − 4129 }
+ − 4130 }
+ − 4131 elsif ($strand eq '-') {
+ − 4132
+ − 4133 for my $index (0..$#methylation_calls) {
+ − 4134 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4135 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4136
+ − 4137 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
+ − 4138 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 4139 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4140 $cigar_offset += $cigar_mod;
+ − 4141 $pos_offset += $pos_mod;
+ − 4142 }
+ − 4143
+ − 4144 if ($methylation_calls[$index] eq 'X') {
+ − 4145 $counting{total_meCHG_count}++;
3
+ − 4146 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4147 $mbias_1{CHG}->{$index+1}->{meth}++;
0
+ − 4148 }
+ − 4149 elsif ($methylation_calls[$index] eq 'x') {
+ − 4150 $counting{total_unmethylated_CHG_count}++;
3
+ − 4151 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4152 $mbias_1{CHG}->{$index+1}->{un}++;
0
+ − 4153 }
+ − 4154 elsif ($methylation_calls[$index] eq 'Z') {
+ − 4155 $counting{total_meCpG_count}++;
3
+ − 4156 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4157 $mbias_1{CpG}->{$index+1}->{meth}++;
0
+ − 4158 }
+ − 4159 elsif ($methylation_calls[$index] eq 'z') {
+ − 4160 $counting{total_unmethylated_CpG_count}++;
3
+ − 4161 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4162 $mbias_1{CpG}->{$index+1}->{un}++;
0
+ − 4163 }
+ − 4164 elsif ($methylation_calls[$index] eq 'H') {
+ − 4165 $counting{total_meCHH_count}++;
3
+ − 4166 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4167 $mbias_1{CHH}->{$index+1}->{meth}++;
0
+ − 4168 }
+ − 4169 elsif ($methylation_calls[$index] eq 'h') {
+ − 4170 $counting{total_unmethylated_CHH_count}++;
3
+ − 4171 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4172 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4173 }
3
+ − 4174 elsif ($methylation_calls[$index] eq '.'){}
+ − 4175 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4176 else{
+ − 4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4178 }
+ − 4179 }
+ − 4180 }
+ − 4181 else {
+ − 4182 die "The strand information was neither + nor -: $strand\n";
+ − 4183 }
+ − 4184 }
+ − 4185
+ − 4186 ### strand-specific methylation output
+ − 4187 elsif ($merge_non_CpG) {
+ − 4188 if ($strand eq '+') {
+ − 4189 for my $index (0..$#methylation_calls) {
+ − 4190 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4191 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4192
+ − 4193 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4194 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4195 $cigar_offset += $cigar_mod;
+ − 4196 $pos_offset += $pos_mod;
+ − 4197 }
+ − 4198
+ − 4199 if ($methylation_calls[$index] eq 'X') {
+ − 4200 $counting{total_meCHG_count}++;
3
+ − 4201 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4202 $mbias_1{CHG}->{$index+1}->{meth}++;
0
+ − 4203 }
+ − 4204 elsif ($methylation_calls[$index] eq 'x') {
+ − 4205 $counting{total_unmethylated_CHG_count}++;
3
+ − 4206 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4207 $mbias_1{CHG}->{$index+1}->{un}++;
0
+ − 4208 }
+ − 4209 elsif ($methylation_calls[$index] eq 'Z') {
+ − 4210 $counting{total_meCpG_count}++;
3
+ − 4211 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4212 $mbias_1{CpG}->{$index+1}->{meth}++;
0
+ − 4213 }
+ − 4214 elsif ($methylation_calls[$index] eq 'z') {
+ − 4215 $counting{total_unmethylated_CpG_count}++;
3
+ − 4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4217 $mbias_1{CpG}->{$index+1}->{un}++;
0
+ − 4218 }
+ − 4219 elsif ($methylation_calls[$index] eq 'H') {
+ − 4220 $counting{total_meCHH_count}++;
3
+ − 4221 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4222 $mbias_1{CHH}->{$index+1}->{meth}++;
0
+ − 4223 }
+ − 4224 elsif ($methylation_calls[$index] eq 'h') {
+ − 4225 $counting{total_unmethylated_CHH_count}++;
3
+ − 4226 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4227 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4228 }
3
+ − 4229 elsif ($methylation_calls[$index] eq '.') {}
+ − 4230 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4231 else{
+ − 4232 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4233 }
+ − 4234 }
+ − 4235 }
+ − 4236 elsif ($strand eq '-') {
+ − 4237
+ − 4238 for my $index (0..$#methylation_calls) {
+ − 4239 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4240 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4241
+ − 4242 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4243 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4244 $cigar_offset += $cigar_mod;
+ − 4245 $pos_offset += $pos_mod;
+ − 4246 }
+ − 4247
+ − 4248 if ($methylation_calls[$index] eq 'X') {
+ − 4249 $counting{total_meCHG_count}++;
3
+ − 4250 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4251 $mbias_1{CHG}->{$index+1}->{meth}++;
0
+ − 4252 }
+ − 4253 elsif ($methylation_calls[$index] eq 'x') {
+ − 4254 $counting{total_unmethylated_CHG_count}++;
3
+ − 4255 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4256 $mbias_1{CHG}->{$index+1}->{un}++;
0
+ − 4257 }
+ − 4258 elsif ($methylation_calls[$index] eq 'Z') {
+ − 4259 $counting{total_meCpG_count}++;
3
+ − 4260 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4261 $mbias_1{CpG}->{$index+1}->{meth}++;
0
+ − 4262 }
+ − 4263 elsif ($methylation_calls[$index] eq 'z') {
+ − 4264 $counting{total_unmethylated_CpG_count}++;
3
+ − 4265 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4266 $mbias_1{CpG}->{$index+1}->{un}++;
0
+ − 4267 }
+ − 4268 elsif ($methylation_calls[$index] eq 'H') {
+ − 4269 $counting{total_meCHH_count}++;
3
+ − 4270 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4271 $mbias_1{CHH}->{$index+1}->{meth}++;
0
+ − 4272 }
+ − 4273 elsif ($methylation_calls[$index] eq 'h') {
+ − 4274 $counting{total_unmethylated_CHH_count}++;
3
+ − 4275 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4276 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4277 }
3
+ − 4278 elsif ($methylation_calls[$index] eq '.') {}
+ − 4279 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4280 else{
+ − 4281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4282 }
+ − 4283 }
+ − 4284 }
+ − 4285 else {
+ − 4286 die "The strand information was neither + nor -: $strand\n";
+ − 4287 }
+ − 4288 }
+ − 4289
+ − 4290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
+ − 4291
+ − 4292 elsif ($full) {
+ − 4293 if ($strand eq '+') {
+ − 4294 for my $index (0..$#methylation_calls) {
+ − 4295 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4296 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4297
+ − 4298 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4300 $cigar_offset += $cigar_mod;
+ − 4301 $pos_offset += $pos_mod;
+ − 4302 }
3
+ − 4303
0
+ − 4304 if ($methylation_calls[$index] eq 'X') {
+ − 4305 $counting{total_meCHG_count}++;
3
+ − 4306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4307 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 4308 }
+ − 4309 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 4310 $counting{total_unmethylated_CHG_count}++;
3
+ − 4311 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4312 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 4313 }
+ − 4314 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 4315 $counting{total_meCpG_count}++;
3
+ − 4316 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4317 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 4318 }
+ − 4319 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 4320 $counting{total_unmethylated_CpG_count}++;
3
+ − 4321 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4322 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 4323 }
+ − 4324 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 4325 $counting{total_meCHH_count}++;
3
+ − 4326 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4327 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 4328 }
+ − 4329 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 4330 $counting{total_unmethylated_CHH_count}++;
3
+ − 4331 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4332 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4333 }
+ − 4334 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 4335 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4336 else{
3
+ − 4337 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
0
+ − 4338 }
+ − 4339 }
+ − 4340 }
+ − 4341 elsif ($strand eq '-') {
+ − 4342
+ − 4343 for my $index (0..$#methylation_calls) {
+ − 4344 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4345 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4346
+ − 4347 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4348 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4349 $cigar_offset += $cigar_mod;
+ − 4350 $pos_offset += $pos_mod;
+ − 4351 }
+ − 4352
+ − 4353 if ($methylation_calls[$index] eq 'X') {
+ − 4354 $counting{total_meCHG_count}++;
3
+ − 4355 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4356 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 4357 }
+ − 4358 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 4359 $counting{total_unmethylated_CHG_count}++;
3
+ − 4360 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4361 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 4362 }
+ − 4363 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 4364 $counting{total_meCpG_count}++;
3
+ − 4365 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4366 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 4367 }
+ − 4368 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 4369 $counting{total_unmethylated_CpG_count}++;
3
+ − 4370 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4371 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 4372 }
+ − 4373 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 4374 $counting{total_meCHH_count}++;
3
+ − 4375 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4376 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 4377 }
+ − 4378 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 4379 $counting{total_unmethylated_CHH_count}++;
3
+ − 4380 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4381 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4382 }
+ − 4383 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 4384 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4385 else{
+ − 4386 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4387 }
+ − 4388 }
+ − 4389 }
+ − 4390 else {
+ − 4391 die "The read had a strand orientation which was neither + nor -: $strand\n";
+ − 4392 }
+ − 4393 }
+ − 4394
+ − 4395 ### strand-specific methylation output
+ − 4396 else {
+ − 4397 if ($strand eq '+') {
+ − 4398 for my $index (0..$#methylation_calls) {
+ − 4399 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4400 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4401
+ − 4402 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4403 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4404 $cigar_offset += $cigar_mod;
+ − 4405 $pos_offset += $pos_mod;
+ − 4406 }
+ − 4407
+ − 4408 if ($methylation_calls[$index] eq 'X') {
+ − 4409 $counting{total_meCHG_count}++;
3
+ − 4410 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4411 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 4412 }
+ − 4413 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 4414 $counting{total_unmethylated_CHG_count}++;
3
+ − 4415 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4416 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 4417 }
+ − 4418 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 4419 $counting{total_meCpG_count}++;
3
+ − 4420 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4421 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 4422 }
+ − 4423 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 4424 $counting{total_unmethylated_CpG_count}++;
3
+ − 4425 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4426 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 4427 }
+ − 4428 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 4429 $counting{total_meCHH_count}++;
3
+ − 4430 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4431 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 4432 }
+ − 4433 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 4434 $counting{total_unmethylated_CHH_count}++;
3
+ − 4435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4436 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4437 }
+ − 4438 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 4439 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4440 else{
+ − 4441 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4442 }
+ − 4443 }
+ − 4444 }
+ − 4445 elsif ($strand eq '-') {
+ − 4446
+ − 4447 for my $index (0..$#methylation_calls) {
+ − 4448 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 4449 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 4450
+ − 4451 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
+ − 4452 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 4453 $cigar_offset += $cigar_mod;
+ − 4454 $pos_offset += $pos_mod;
+ − 4455 }
+ − 4456
+ − 4457 if ($methylation_calls[$index] eq 'X') {
+ − 4458 $counting{total_meCHG_count}++;
3
+ − 4459 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4460 $mbias_1{CHG}->{$index+1}->{meth}++;
+ − 4461 }
+ − 4462 elsif ($methylation_calls[$index] eq 'x') {
0
+ − 4463 $counting{total_unmethylated_CHG_count}++;
3
+ − 4464 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4465 $mbias_1{CHG}->{$index+1}->{un}++;
+ − 4466 }
+ − 4467 elsif ($methylation_calls[$index] eq 'Z') {
0
+ − 4468 $counting{total_meCpG_count}++;
3
+ − 4469 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4470 $mbias_1{CpG}->{$index+1}->{meth}++;
+ − 4471 }
+ − 4472 elsif ($methylation_calls[$index] eq 'z') {
0
+ − 4473 $counting{total_unmethylated_CpG_count}++;
3
+ − 4474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4475 $mbias_1{CpG}->{$index+1}->{un}++;
+ − 4476 }
+ − 4477 elsif ($methylation_calls[$index] eq 'H') {
0
+ − 4478 $counting{total_meCHH_count}++;
3
+ − 4479 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4480 $mbias_1{CHH}->{$index+1}->{meth}++;
+ − 4481 }
+ − 4482 elsif ($methylation_calls[$index] eq 'h') {
0
+ − 4483 $counting{total_unmethylated_CHH_count}++;
3
+ − 4484 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
+ − 4485 $mbias_1{CHH}->{$index+1}->{un}++;
0
+ − 4486 }
+ − 4487 elsif ($methylation_calls[$index] eq '.') {}
3
+ − 4488 elsif (lc$methylation_calls[$index] eq 'u'){}
0
+ − 4489 else{
+ − 4490 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 4491 }
+ − 4492 }
+ − 4493 }
+ − 4494 else {
+ − 4495 die "The strand information was neither + nor -: $strand\n";
+ − 4496 }
+ − 4497 }
+ − 4498 }
+ − 4499
+ − 4500
+ − 4501
+ − 4502 sub print_helpfile{
+ − 4503
+ − 4504 print << 'HOW_TO';
+ − 4505
+ − 4506
+ − 4507 DESCRIPTION
+ − 4508
+ − 4509 The following is a brief description of all options to control the Bismark
+ − 4510 methylation extractor. The script reads in a bisulfite read alignment results file
+ − 4511 produced by the Bismark bisulfite mapper and extracts the methylation information
+ − 4512 for individual cytosines. This information is found in the methylation call field
+ − 4513 which can contain the following characters:
+ − 4514
3
+ − 4515 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ − 4516 ~~~ X for methylated C in CHG context ~~~
+ − 4517 ~~~ x for not methylated C CHG ~~~
+ − 4518 ~~~ H for methylated C in CHH context ~~~
+ − 4519 ~~~ h for not methylated C in CHH context ~~~
+ − 4520 ~~~ Z for methylated C in CpG context ~~~
+ − 4521 ~~~ z for not methylated C in CpG context ~~~
+ − 4522 ~~~ U for methylated C in Unknown context (CN or CHN ~~~
+ − 4523 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
+ − 4524 ~~~ . for any bases not involving cytosines ~~~
+ − 4525 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0
+ − 4526
+ − 4527 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
+ − 4528 context (this distinction is actually already made in Bismark itself). As the methylation
+ − 4529 information for every C analysed can produce files which easily have tens or even hundreds of
+ − 4530 millions of lines, file sizes can become very large and more difficult to handle. The C
+ − 4531 methylation info additionally splits cytosine methylation calls up into one of the four possible
+ − 4532 strands a given bisulfite read aligned against:
+ − 4533
+ − 4534 OT original top strand
+ − 4535 CTOT complementary to original top strand
+ − 4536
+ − 4537 OB original bottom strand
+ − 4538 CTOB complementary to original bottom strand
+ − 4539
+ − 4540 Thus, by default twelve individual output files are being generated per input file (unless
+ − 4541 --comprehensive is specified, see below). The output files can be imported into a genome
+ − 4542 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
+ − 4543 unless the bisulfite reads were generated preserving directionality it doesn't make any
+ − 4544 sense to look at the data in a strand-specific manner). Strand-specific output files can
+ − 4545 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
+ − 4546 will be generated. For both the strand-specific and comprehensive outputs there is also
+ − 4547 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
+ − 4548
+ − 4549
+ − 4550 The output files are in the following format (tab delimited):
+ − 4551
+ − 4552 <sequence_id> <strand> <chromosome> <position> <methylation call>
+ − 4553
+ − 4554
+ − 4555 USAGE: methylation_extractor [options] <filenames>
+ − 4556
+ − 4557
+ − 4558 ARGUMENTS:
3
+ − 4559 ==========
0
+ − 4560
+ − 4561 <filenames> A space-separated list of Bismark result files in SAM format from
+ − 4562 which methylation information is extracted for every cytosine in
+ − 4563 the reads. For alignment files in the older custom Bismark output
+ − 4564 see option '--vanilla'.
+ − 4565
+ − 4566 OPTIONS:
+ − 4567
+ − 4568 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
+ − 4569 read data. Specifying either --single-end or --paired-end is
+ − 4570 mandatory.
+ − 4571
+ − 4572 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
+ − 4573 read data. Specifying either --paired-end or --single-end is
+ − 4574 mandatory.
+ − 4575
+ − 4576 --vanilla The Bismark result input file(s) are in the old custom Bismark format
+ − 4577 (up to version 0.5.x) and not in SAM format which is the default as
+ − 4578 of Bismark version 0.6.x or higher. Default: OFF.
+ − 4579
+ − 4580 --no_overlap For paired-end reads it is theoretically possible that read_1 and
+ − 4581 read_2 overlap. This option avoids scoring overlapping methylation
+ − 4582 calls twice (only methylation calls of read 1 are used for in the process
+ − 4583 since read 1 has historically higher quality basecalls than read 2).
+ − 4584 Whilst this option removes a bias towards more methylation calls
+ − 4585 in the center of sequenced fragments it may de facto remove a sizable
+ − 4586 proportion of the data. This option is highly recommended for paired-end
+ − 4587 data.
+ − 4588
3
+ − 4589 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 when processing the
0
+ − 4590 methylation call string. This can remove e.g. a restriction enzyme site
3
+ − 4591 at the start of each read or any other source of bias (e.g. PBAT-Seq data).
+ − 4592
+ − 4593 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
+ − 4594 results only. Since the first couple of bases in Read 2 of BS-Seq experiments
+ − 4595 show a severe bias towards non-methylation as a result of end-repairing
+ − 4596 sonicated fragments with unmethylated cytosines (see M-bias plot), it is
+ − 4597 recommended that the first couple of bp of Read 2 are removed before
+ − 4598 starting downstream analysis. Please see the section on M-bias plots in the
+ − 4599 Bismark User Guide for more details.
+ − 4600
+ − 4601 --comprehensive Specifying this option will merge all four possible strand-specific
+ − 4602 methylation info into context-dependent output files. The default
+ − 4603
0
+ − 4604 contexts are:
+ − 4605 - CpG context
+ − 4606 - CHG context
+ − 4607 - CHH context
+ − 4608
+ − 4609 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
+ − 4610 strand-specific output files (default) for Cs in
+ − 4611 - CpG context
+ − 4612 - non-CpG context
+ − 4613
+ − 4614 --report Prints out a short methylation summary as well as the paramaters used to run
+ − 4615 this script.
+ − 4616
+ − 4617 --no_header Suppresses the Bismark version header line in all output files for more convenient
+ − 4618 batch processing.
+ − 4619
+ − 4620 -o/--output DIR Allows specification of a different output directory (absolute or relative
+ − 4621 path). If not specified explicitely, the output will be written to the current directory.
+ − 4622
+ − 4623 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
+ − 4624 explicitly if Samtools is in the PATH already.
+ − 4625
+ − 4626 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
+ − 4627 a GZIP compressed form to save disk space. This option does not work on bedGraph and
+ − 4628 genome-wide cytosine reports as they are 'tiny' anyway.
+ − 4629
+ − 4630 --version Displays version information.
+ − 4631
+ − 4632 -h/--help Displays this help file and exits.
+ − 4633
3
+ − 4634 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
+ − 4635 well as a report (optional) and then quit. Default: OFF.
+ − 4636
0
+ − 4637
+ − 4638
+ − 4639 bedGraph specific options:
3
+ − 4640 ==========================
0
+ − 4641
+ − 4642 --bedGraph After finishing the methylation extraction, the methylation output is written into a
+ − 4643 sorted bedGraph file that reports the position of a given cytosine and its methylation
+ − 4644 state (in %, see details below). The methylation extractor output is temporarily split up into
+ − 4645 temporary files, one per chromosome (written into the current directory or folder
+ − 4646 specified with -o/--output); these temp files are then used for sorting and deleted
+ − 4647 afterwards. By default, only cytosines in CpG context will be sorted. The option
+ − 4648 '--CX_context' may be used to report all cytosines irrespective of sequence context
+ − 4649 (this will take MUCH longer!). The default folder for temporary files during the sorting
+ − 4650 process is the output directory. The bedGraph conversion step is performed by the external
+ − 4651 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
+ − 4652 bismark_methylation_extractor itself.
+ − 4653
+ − 4654
+ − 4655 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
+ − 4656 before its methylation percentage is reported. Default: 1.
+ − 4657
+ − 4658 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
+ − 4659
+ − 4660
+ − 4661 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
+ − 4662 in the experiment irrespective of its sequence context. This applies to both forward and
+ − 4663 reverse strands. Please be aware that this option may generate large temporary and output files
+ − 4664 and may take a long time to sort (up to many hours). Default: OFF.
+ − 4665 (i.e. Default = CpG context only).
+ − 4666
+ − 4667 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
+ − 4668 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
+ − 4669 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
+ − 4670 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
+ − 4671 Defaults to 2G.
+ − 4672
3
+ − 4673 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
+ − 4674 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
+ − 4675 individual chromosome files. These errors were caused by the operating system's limit
+ − 4676 of the number of filehandle that can be written to at any one time (typically 1024; to
+ − 4677 find out this limit on Linux, type: ulimit -a).
+ − 4678 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
+ − 4679 methylation calls into individual chromosome files. Instead, all input files are
+ − 4680 temporarily merged into a single file (unless there is only a single file), and this
+ − 4681 file will then be sorted by both chromosome AND position using the Unix sort command.
+ − 4682 Please be aware that this option might take a looooong time to complete, depending on
+ − 4683 the size of the input files, and the memory you allocate to this process (see --buffer_size).
+ − 4684 Nevertheless, it seems to be working.
+ − 4685
+ − 4686 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
+ − 4687 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
+ − 4688 sorting process of very large files, but this comes at the cost of a larger memory footprint
+ − 4689 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
+ − 4690 of RAM). Due to overheads in creating and looping through these arrays it seems that it will
+ − 4691 actually be *slower* for small files (few million alignments), and we are currently testing at
+ − 4692 which point it is advisable to use this option. Note that --ample_memory is not compatible
+ − 4693 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
+ − 4694
+ − 4695
0
+ − 4696
+ − 4697 Genome-wide cytosine methylation report specific options:
3
+ − 4698 =========================================================
0
+ − 4699
+ − 4700 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
+ − 4701 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
+ − 4702 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
+ − 4703 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
+ − 4704 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
+ − 4705 covered). The cytsoine report conversion step is performed by the external module
+ − 4706 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
+ − 4707 itself.
+ − 4708
+ − 4709 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
+ − 4710 its context. This applies to both forward and reverse strands. Please be aware that this will
+ − 4711 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
+ − 4712 Default: OFF (i.e. Default = CpG context only).
+ − 4713
+ − 4714 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
+ − 4715
+ − 4716 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
+ − 4717 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
+ − 4718
+ − 4719 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
+ − 4720 will be named to include the input filename and the chromosome number.
+ − 4721
+ − 4722
+ − 4723
+ − 4724 OUTPUT:
+ − 4725
+ − 4726 The bismark_methylation_extractor output is in the form:
+ − 4727 ========================================================
+ − 4728 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
+ − 4729
+ − 4730 * Methylated cytosines receive a '+' orientation,
+ − 4731 * Unmethylated cytosines receive a '-' orientation.
+ − 4732
+ − 4733
+ − 4734
3
+ − 4735 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
+ − 4736 =========================================================================================================
+ − 4737
+ − 4738 track type=bedGraph (header line)
+ − 4739
0
+ − 4740 <chromosome> <start position> <end position> <methylation percentage>
+ − 4741
3
+ − 4742
+ − 4743
+ − 4744 The coverage output looks like this (tab-delimited, 1-based genomic coords):
+ − 4745 ============================================================================
0
+ − 4746
+ − 4747 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
+ − 4748
+ − 4749
+ − 4750
+ − 4751 The genome-wide cytosine methylation output file is tab-delimited in the following format:
+ − 4752 ==========================================================================================
+ − 4753 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
+ − 4754
+ − 4755
+ − 4756
3
+ − 4757 This script was last modified on 25 November 2013.
0
+ − 4758
+ − 4759 HOW_TO
+ − 4760 }