0
+ − 1 #!/usr/bin/perl
+ − 2 use warnings;
+ − 3 use strict;
+ − 4 $|++;
+ − 5 use Getopt::Long;
+ − 6 use Cwd;
+ − 7 use Carp;
+ − 8
+ − 9 my @filenames; # input files
+ − 10 my %counting;
+ − 11 my $parent_dir = getcwd();
+ − 12
+ − 13 my %fhs;
+ − 14
+ − 15 my $version = 'v0.7.7';
+ − 16 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome) = process_commandline();
+ − 17
+ − 18
+ − 19 ### only needed for bedGraph output
+ − 20 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
+ − 21 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
+ − 22 my @bedfiles;
+ − 23
+ − 24 ### only needed for genome-wide cytosine methylation report
+ − 25 my %chromosomes;
+ − 26
+ − 27 ##############################################################################################
+ − 28 ### Summarising Run Parameters
+ − 29 ##############################################################################################
+ − 30
+ − 31 ### METHYLATION EXTRACTOR
+ − 32
+ − 33 warn "Summarising Bismark methylation extractor parameters:\n";
+ − 34 warn '='x63,"\n";
+ − 35
+ − 36 if ($single){
+ − 37 if ($vanilla){
+ − 38 warn "Bismark single-end vanilla format specified\n";
+ − 39 }
+ − 40 else{
+ − 41 warn "Bismark single-end SAM format specified (default)\n"; # default
+ − 42 }
+ − 43 }
+ − 44 elsif ($paired){
+ − 45 if ($vanilla){
+ − 46 warn "Bismark paired-end vanilla format specified\n";
+ − 47 }
+ − 48 else{
+ − 49 warn "Bismark paired-end SAM format specified (default)\n"; # default
+ − 50 }
+ − 51 }
+ − 52
+ − 53 if ($ignore){
+ − 54 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
+ − 55 }
+ − 56
+ − 57 if ($full){
+ − 58 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
+ − 59 }
+ − 60 if ($merge_non_CpG){
+ − 61 warn "Merge CHG and CHH context to non-CpG context specified\n";
+ − 62 }
+ − 63 ### output directory
+ − 64 if ($output_dir eq ''){
+ − 65 warn "Output will be written to the current directory ('$parent_dir')\n";
+ − 66 }
+ − 67 else{
+ − 68 warn "Output path specified as: $output_dir\n";
+ − 69 }
+ − 70
+ − 71
+ − 72 sleep (1);
+ − 73
+ − 74 ### BEDGRAPH
+ − 75
+ − 76 if ($bedGraph){
+ − 77 warn "\n\nSummarising bedGraph parameters:\n";
+ − 78 warn '='x63,"\n";
+ − 79
+ − 80 if ($counts){
+ − 81 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n";
+ − 82 }
+ − 83 else{
+ − 84 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
+ − 85 }
+ − 86
+ − 87 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
+ − 88
+ − 89 if ($CX_context){
+ − 90 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
+ − 91 }
+ − 92 else{ # default
+ − 93 $CpG_only = 1;
+ − 94 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
+ − 95 }
+ − 96
+ − 97 if ($remove){
+ − 98 warn "White spaces in read ID names will be removed prior to sorting\n";
+ − 99 }
+ − 100
+ − 101 sleep (1);
+ − 102
+ − 103 if ($cytosine_report){
+ − 104 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
+ − 105 warn '='x63,"\n";
+ − 106 warn "Generating comprehensive genome-wide cytosine report (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> )\n";
+ − 107
+ − 108
+ − 109 if ($CX_context){
+ − 110 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
+ − 111 }
+ − 112 else{ # default
+ − 113 $CpG_only = 1;
+ − 114 warn "Reporting cytosine methylation in CpG context only (default)\n";
+ − 115 }
+ − 116
+ − 117 if ($split_by_chromosome){
+ − 118 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
+ − 119 }
+ − 120
+ − 121 ### Zero-based coordinates
+ − 122 if ($zero){
+ − 123 warn "Using zero-based genomic coordinates (user-defined)\n";
+ − 124 }
+ − 125 else{ # default, 1-based coords
+ − 126 warn "Using 1-based genomic coordinates (default)\n";
+ − 127 }
+ − 128
+ − 129 ### GENOME folder
+ − 130 if ($genome_folder){
+ − 131 unless ($genome_folder =~/\/$/){
+ − 132 $genome_folder =~ s/$/\//;
+ − 133 }
+ − 134 warn "Genome folder was specified as $genome_folder\n";
+ − 135 }
+ − 136 else{
+ − 137 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
+ − 138 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
+ − 139 }
+ − 140 sleep (1);
+ − 141 }
+ − 142 }
+ − 143
+ − 144 warn "\n";
+ − 145 sleep (5);
+ − 146
+ − 147 ######################################################
+ − 148 ### PROCESSING FILES
+ − 149 ######################################################
+ − 150
+ − 151 foreach my $filename (@filenames){
+ − 152 # resetting counters and filehandles
+ − 153 %fhs = ();
+ − 154 %counting =(
+ − 155 total_meCHG_count => 0,
+ − 156 total_meCHH_count => 0,
+ − 157 total_meCpG_count => 0,
+ − 158 total_unmethylated_CHG_count => 0,
+ − 159 total_unmethylated_CHH_count => 0,
+ − 160 total_unmethylated_CpG_count => 0,
+ − 161 sequences_count => 0,
+ − 162 );
+ − 163 @sorting_files = ();
+ − 164 @bedfiles = ();
+ − 165
+ − 166 process_Bismark_results_file($filename);
+ − 167
+ − 168 if ($bedGraph){
+ − 169 my $out = $filename;
+ − 170 $out =~ s/sam$//;
+ − 171 $out =~ s/txt$//;
+ − 172 $out =~ s/$/bedGraph/;
+ − 173
+ − 174 my $bedGraph_output = $out;
+ − 175 open (OUT,'>',$output_dir.$out) or die $!;
+ − 176 # warn "Writing bedGraph to file: $out\n";
+ − 177
+ − 178 process_bedGraph_output();
+ − 179 close OUT or die $!;
+ − 180
+ − 181 ### genome-wide cytosine methylation report requires bedGraph processing anyway
+ − 182 if ($cytosine_report){
+ − 183 my $cytosine_out = $out;
+ − 184 $cytosine_out =~ s/bedGraph$//;
+ − 185
+ − 186 read_genome_into_memory();
+ − 187 generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
+ − 188 }
+ − 189 }
+ − 190 }
+ − 191
+ − 192
+ − 193 sub process_commandline{
+ − 194 my $help;
+ − 195 my $single_end;
+ − 196 my $paired_end;
+ − 197 my $ignore;
+ − 198 my $genomic_fasta;
+ − 199 my $full;
+ − 200 my $report;
+ − 201 my $extractor_version;
+ − 202 my $no_overlap;
+ − 203 my $merge_non_CpG;
+ − 204 my $vanilla;
+ − 205 my $output_dir;
+ − 206 my $no_header;
+ − 207 my $bedGraph;
+ − 208 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
+ − 209 my $remove;
+ − 210 my $counts;
+ − 211 my $cytosine_report;
+ − 212 my $genome_folder;
+ − 213 my $zero;
+ − 214 my $CpG_only;
+ − 215 my $CX_context;
+ − 216 my $split_by_chromosome;
+ − 217
+ − 218
+ − 219 my $command_line = GetOptions ('help|man' => \$help,
+ − 220 'p|paired-end' => \$paired_end,
+ − 221 's|single-end' => \$single_end,
+ − 222 'fasta' => \$genomic_fasta,
+ − 223 'ignore=i' => \$ignore,
+ − 224 'comprehensive' => \$full,
+ − 225 'report' => \$report,
+ − 226 'version' => \$extractor_version,
+ − 227 'no_overlap' => \$no_overlap,
+ − 228 'merge_non_CpG' => \$merge_non_CpG,
+ − 229 'vanilla' => \$vanilla,
+ − 230 'o|output=s' => \$output_dir,
+ − 231 'no_header' => \$no_header,
+ − 232 'bedGraph' => \$bedGraph,
+ − 233 "cutoff=i" => \$coverage_threshold,
+ − 234 "remove_spaces" => \$remove,
+ − 235 "counts" => \$counts,
+ − 236 "cytosine_report" => \$cytosine_report,
+ − 237 'g|genome_folder=s' => \$genome_folder,
+ − 238 "zero_based" => \$zero,
+ − 239 "CX|CX_context" => \$CX_context,
+ − 240 "split_by_chromosome" => \$split_by_chromosome,
+ − 241 );
+ − 242
+ − 243 ### EXIT ON ERROR if there were errors with any of the supplied options
+ − 244 unless ($command_line){
+ − 245 die "Please respecify command line options\n";
+ − 246 }
+ − 247
+ − 248 ### HELPFILE
+ − 249 if ($help){
+ − 250 print_helpfile();
+ − 251 exit;
+ − 252 }
+ − 253
+ − 254 if ($extractor_version){
+ − 255 print << "VERSION";
+ − 256
+ − 257
+ − 258 Bismark Methylation Extractor
+ − 259
+ − 260 Bismark Extractor Version: $version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
+ − 261 www.bioinformatics.babraham.ac.uk/projects/bismark/
+ − 262
+ − 263
+ − 264 VERSION
+ − 265 exit;
+ − 266 }
+ − 267
+ − 268
+ − 269 ### no files provided
+ − 270 unless (@ARGV){
+ − 271 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
+ − 272 }
+ − 273 @filenames = @ARGV;
+ − 274
+ − 275 warn "\n *** Bismark methylation extractor version $version ***\n\n";
+ − 276
+ − 277 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
+ − 278 unless ($ignore){
+ − 279 $ignore = 0;
+ − 280 }
+ − 281 ### PRINT A REPORT
+ − 282 unless ($report){
+ − 283 $report = 0;
+ − 284 }
+ − 285
+ − 286 ### OUTPUT DIR PATH
+ − 287 if ($output_dir){
+ − 288 unless ($output_dir =~ /\/$/){
+ − 289 $output_dir =~ s/$/\//;
+ − 290 }
+ − 291 }
+ − 292 else{
+ − 293 $output_dir = '';
+ − 294 }
+ − 295
+ − 296 ### NO HEADER
+ − 297 unless ($no_header){
+ − 298 $no_header = 0;
+ − 299 }
+ − 300
+ − 301 ### OLD (VANILLA) OUTPUT FORMAT
+ − 302 unless ($vanilla){
+ − 303 $vanilla = 0;
+ − 304 }
+ − 305
+ − 306 if ($single_end){
+ − 307 $paired_end = 0; ### SINGLE END ALIGNMENTS
+ − 308 }
+ − 309 elsif ($paired_end){
+ − 310 $single_end = 0; ### PAIRED-END ALIGNMENTS
+ − 311 }
+ − 312 else{
+ − 313 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
+ − 314 }
+ − 315
+ − 316 ### NO OVERLAP
+ − 317 if ($no_overlap){
+ − 318 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
+ − 319 }
+ − 320 else{
+ − 321 $no_overlap = 0;
+ − 322 }
+ − 323
+ − 324 ### COMPREHENSIVE OUTPUT
+ − 325 unless ($full){
+ − 326 $full = 0;
+ − 327 }
+ − 328
+ − 329 ### MERGE NON-CpG context
+ − 330 unless ($merge_non_CpG){
+ − 331 $merge_non_CpG = 0;
+ − 332 }
+ − 333
+ − 334 ### remove white spaces in read ID (needed for sorting using the sort command
+ − 335 unless ($remove){
+ − 336 $remove = 0;
+ − 337 }
+ − 338
+ − 339 ### COVERAGE THRESHOLD FOR gedGraph OUTPUT
+ − 340 unless (defined $coverage_threshold){
+ − 341 unless ($coverage_threshold > 0){
+ − 342 die "Please select a coverage greater than 0 (positive integers only)\n";
+ − 343 }
+ − 344 $coverage_threshold = 1;
+ − 345 }
+ − 346
+ − 347 if ($zero){
+ − 348 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
+ − 349 }
+ − 350
+ − 351 if ($CX_context){
+ − 352 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
+ − 353 }
+ − 354
+ − 355 if ($cytosine_report){
+ − 356
+ − 357 ### GENOME folder
+ − 358 if ($genome_folder){
+ − 359 unless ($genome_folder =~/\/$/){
+ − 360 $genome_folder =~ s/$/\//;
+ − 361 }
+ − 362 }
+ − 363 else{
+ − 364 die "Please specify a genome folder to proceed (full path only)\n";
+ − 365 }
+ − 366
+ − 367 unless ($bedGraph){
+ − 368 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
+ − 369 $bedGraph = 1;
+ − 370 }
+ − 371 unless ($counts){
+ − 372 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
+ − 373 $counts = 1;
+ − 374 }
+ − 375 warn "\n";
+ − 376 }
+ − 377
+ − 378 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome);
+ − 379 }
+ − 380
+ − 381
+ − 382 sub process_Bismark_results_file{
+ − 383 my $filename = shift;
+ − 384
+ − 385 warn "\nNow reading in Bismark result file $filename\n\n";
+ − 386
+ − 387 if ($filename =~ /\.gz$/) {
+ − 388 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
+ − 389 } else {
+ − 390 open (IN,$filename) or die "Can't open file $filename: $!\n";
+ − 391 }
+ − 392
+ − 393 ### Vanilla and SAM output need to read different numbers of header lines
+ − 394 if ($vanilla) {
+ − 395 my $bismark_version = <IN>; ## discarding the Bismark version info
+ − 396 chomp $bismark_version;
+ − 397 $bismark_version =~ s/\r//; # replaces \r line feed
+ − 398 $bismark_version =~ s/Bismark version: //;
+ − 399 if ($bismark_version =~ /^\@/) {
+ − 400 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
+ − 401 sleep (2);
+ − 402 }
+ − 403
+ − 404 unless ($version eq $bismark_version){
+ − 405 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
+ − 406 }
+ − 407 } else {
+ − 408 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
+ − 409 # We are reading from it further down
+ − 410 }
+ − 411
+ − 412 my $output_filename = (split (/\//,$filename))[-1];
+ − 413
+ − 414 ### OPENING OUTPUT-FILEHANDLES
+ − 415 if ($report) {
+ − 416 my $report_filename = $output_filename;
+ − 417 $report_filename =~ s/[\.sam|\.txt]$//;
+ − 418 $report_filename =~ s/$/_splitting_report.txt/;
+ − 419 $report_filename = $output_dir . $report_filename;
+ − 420 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
+ − 421 }
+ − 422
+ − 423 if ($report) {
+ − 424 print REPORT "$output_filename\n\n";
+ − 425 print REPORT "Parameters used to extract methylation information:\n";
+ − 426 if ($paired) {
+ − 427 if ($vanilla) {
+ − 428 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
+ − 429 } else {
+ − 430 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
+ − 431 }
+ − 432 }
+ − 433
+ − 434 if ($single) {
+ − 435 if ($vanilla) {
+ − 436 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
+ − 437 } else {
+ − 438 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
+ − 439 }
+ − 440 }
+ − 441
+ − 442 if ($ignore) {
+ − 443 print REPORT "Ignoring first $ignore bases\n";
+ − 444 }
+ − 445
+ − 446 if ($full) {
+ − 447 print REPORT "Output specified: comprehensive\n";
+ − 448 } else {
+ − 449 print REPORT "Output specified: strand-specific (default)\n";
+ − 450 }
+ − 451
+ − 452 if ($no_overlap) {
+ − 453 print REPORT "No overlapping methylation calls specified\n";
+ − 454 }
+ − 455 if ($genomic_fasta) {
+ − 456 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
+ − 457 }
+ − 458 if ($merge_non_CpG) {
+ − 459 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
+ − 460 }
+ − 461
+ − 462 print REPORT "\n";
+ − 463 }
+ − 464
+ − 465 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
+ − 466 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
+ − 467 if ($full and $merge_non_CpG) {
+ − 468 my $cpg_output = my $other_c_output = $output_filename;
+ − 469 ### C in CpG context
+ − 470 $cpg_output =~ s/^/CpG_context_/;
+ − 471 $cpg_output =~ s/sam$/txt/;
+ − 472 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ − 473 $cpg_output = $output_dir . $cpg_output;
+ − 474 push @sorting_files,$cpg_output;
+ − 475 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
+ − 476 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
+ − 477
+ − 478 unless ($no_header) {
+ − 479 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
+ − 480 }
+ − 481
+ − 482 ### C in any other context than CpG
+ − 483 $other_c_output =~ s/^/Non_CpG_context_/;
+ − 484 $other_c_output =~ s/sam$/txt/;
+ − 485 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
+ − 486 $other_c_output = $output_dir . $other_c_output;
+ − 487 push @sorting_files,$other_c_output;
+ − 488 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
+ − 489 print "Writing result file containing methylation information for C in any other context to $other_c_output\n";
+ − 490
+ − 491 unless ($no_header) {
+ − 492 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
+ − 493 }
+ − 494 }
+ − 495
+ − 496 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
+ − 497 elsif ($merge_non_CpG) {
+ − 498
+ − 499 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+ − 500
+ − 501 ### For cytosines in CpG context
+ − 502 $cpg_ot =~ s/^/CpG_OT_/;
+ − 503 $cpg_ot =~ s/sam$/txt/;
+ − 504 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ − 505 $cpg_ot = $output_dir . $cpg_ot;
+ − 506 push @sorting_files,$cpg_ot;
+ − 507 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
+ − 508 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
+ − 509
+ − 510 unless($no_header){
+ − 511 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 512 }
+ − 513
+ − 514 $cpg_ctot =~ s/^/CpG_CTOT_/;
+ − 515 $cpg_ctot =~ s/sam$/txt/;
+ − 516 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ − 517 $cpg_ctot = $output_dir . $cpg_ctot;
+ − 518 push @sorting_files,$cpg_ctot;
+ − 519 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
+ − 520 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
+ − 521
+ − 522 unless($no_header){
+ − 523 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 524 }
+ − 525
+ − 526 $cpg_ctob =~ s/^/CpG_CTOB_/;
+ − 527 $cpg_ctob =~ s/sam$/txt/;
+ − 528 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ − 529 $cpg_ctob = $output_dir . $cpg_ctob;
+ − 530 push @sorting_files,$cpg_ctob;
+ − 531 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
+ − 532 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
+ − 533
+ − 534 unless($no_header){
+ − 535 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 536 }
+ − 537
+ − 538 $cpg_ob =~ s/^/CpG_OB_/;
+ − 539 $cpg_ob =~ s/sam$/txt/;
+ − 540 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ − 541 $cpg_ob = $output_dir . $cpg_ob;
+ − 542 push @sorting_files,$cpg_ob;
+ − 543 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
+ − 544 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
+ − 545
+ − 546 unless($no_header){
+ − 547 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 548 }
+ − 549
+ − 550 ### For cytosines in Non-CpG (CC, CT or CA) context
+ − 551 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
+ − 552
+ − 553 $other_c_ot =~ s/^/Non_CpG_OT_/;
+ − 554 $other_c_ot =~ s/sam$/txt/;
+ − 555 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
+ − 556 $other_c_ot = $output_dir . $other_c_ot;
+ − 557 push @sorting_files,$other_c_ot;
+ − 558 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
+ − 559 print "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
+ − 560
+ − 561 unless($no_header){
+ − 562 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
+ − 563 }
+ − 564
+ − 565 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
+ − 566 $other_c_ctot =~ s/sam$/txt/;
+ − 567 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
+ − 568 $other_c_ctot = $output_dir . $other_c_ctot;
+ − 569 push @sorting_files,$other_c_ctot;
+ − 570 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
+ − 571 print "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
+ − 572
+ − 573 unless($no_header){
+ − 574 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
+ − 575 }
+ − 576
+ − 577 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
+ − 578 $other_c_ctob =~ s/sam$/txt/;
+ − 579 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
+ − 580 $other_c_ctob = $output_dir . $other_c_ctob;
+ − 581 push @sorting_files,$other_c_ctob;
+ − 582 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
+ − 583 print "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
+ − 584
+ − 585 unless($no_header){
+ − 586 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
+ − 587 }
+ − 588
+ − 589 $other_c_ob =~ s/^/Non_CpG_OB_/;
+ − 590 $other_c_ob =~ s/sam$/txt/;
+ − 591 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
+ − 592 $other_c_ob = $output_dir . $other_c_ob;
+ − 593 push @sorting_files,$other_c_ob;
+ − 594 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
+ − 595 print "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
+ − 596
+ − 597 unless($no_header){
+ − 598 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
+ − 599 }
+ − 600 }
+ − 601 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
+ − 602
+ − 603 ### if --comprehensive was specified we are only writing one file per context
+ − 604 elsif ($full) {
+ − 605 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
+ − 606 ### C in CpG context
+ − 607 $cpg_output =~ s/^/CpG_context_/;
+ − 608 $cpg_output =~ s/sam$/txt/;
+ − 609 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
+ − 610 $cpg_output = $output_dir . $cpg_output;
+ − 611 push @sorting_files,$cpg_output;
+ − 612 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
+ − 613 print "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
+ − 614
+ − 615 unless($no_header){
+ − 616 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
+ − 617 }
+ − 618
+ − 619 ### C in CHG context
+ − 620 $chg_output =~ s/^/CHG_context_/;
+ − 621 $chg_output =~ s/sam$/txt/;
+ − 622 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
+ − 623 $chg_output = $output_dir . $chg_output;
+ − 624 push @sorting_files,$chg_output;
+ − 625 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
+ − 626 print "Writing result file containing methylation information for C in CHG context to $chg_output\n";
+ − 627
+ − 628 unless($no_header){
+ − 629 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
+ − 630 }
+ − 631
+ − 632 ### C in CHH context
+ − 633 $chh_output =~ s/^/CHH_context_/;
+ − 634 $chh_output =~ s/sam$/txt/;
+ − 635 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
+ − 636 $chh_output = $output_dir . $chh_output;
+ − 637 push @sorting_files, $chh_output;
+ − 638 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
+ − 639 print "Writing result file containing methylation information for C in CHH context to $chh_output\n";
+ − 640
+ − 641 unless($no_header){
+ − 642 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n";
+ − 643 }
+ − 644 }
+ − 645 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
+ − 646 else {
+ − 647 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
+ − 648
+ − 649 ### For cytosines in CpG context
+ − 650 $cpg_ot =~ s/^/CpG_OT_/;
+ − 651 $cpg_ot =~ s/sam$/txt/;
+ − 652 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
+ − 653 $cpg_ot = $output_dir . $cpg_ot;
+ − 654 push @sorting_files,$cpg_ot;
+ − 655 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
+ − 656 print "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
+ − 657
+ − 658 unless($no_header){
+ − 659 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 660 }
+ − 661
+ − 662 $cpg_ctot =~ s/^/CpG_CTOT_/;
+ − 663 $cpg_ctot =~ s/sam$/txt/;
+ − 664 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
+ − 665 $cpg_ctot = $output_dir . $cpg_ctot;
+ − 666 push @sorting_files,$cpg_ctot;
+ − 667 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
+ − 668 print "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
+ − 669
+ − 670 unless($no_header){
+ − 671 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 672 }
+ − 673
+ − 674 $cpg_ctob =~ s/^/CpG_CTOB_/;
+ − 675 $cpg_ctob =~ s/sam$/txt/;
+ − 676 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
+ − 677 $cpg_ctob = $output_dir . $cpg_ctob;
+ − 678 push @sorting_files,$cpg_ctob;
+ − 679 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
+ − 680 print "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
+ − 681
+ − 682 unless($no_header){
+ − 683 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 684 }
+ − 685
+ − 686 $cpg_ob =~ s/^/CpG_OB_/;
+ − 687 $cpg_ob =~ s/sam$/txt/;
+ − 688 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
+ − 689 $cpg_ob = $output_dir . $cpg_ob;
+ − 690 push @sorting_files,$cpg_ob;
+ − 691 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
+ − 692 print "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
+ − 693
+ − 694 unless($no_header){
+ − 695 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
+ − 696 }
+ − 697
+ − 698 ### For cytosines in CHG context
+ − 699 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
+ − 700
+ − 701 $chg_ot =~ s/^/CHG_OT_/;
+ − 702 $chg_ot =~ s/sam$/txt/;
+ − 703 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
+ − 704 $chg_ot = $output_dir . $chg_ot;
+ − 705 push @sorting_files,$chg_ot;
+ − 706 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
+ − 707 print "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
+ − 708
+ − 709 unless($no_header){
+ − 710 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
+ − 711 }
+ − 712
+ − 713 $chg_ctot =~ s/^/CHG_CTOT_/;
+ − 714 $chg_ctot =~ s/sam$/txt/;
+ − 715 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
+ − 716 $chg_ctot = $output_dir . $chg_ctot;
+ − 717 push @sorting_files,$chg_ctot;
+ − 718 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
+ − 719 print "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
+ − 720
+ − 721 unless($no_header){
+ − 722 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
+ − 723 }
+ − 724
+ − 725 $chg_ctob =~ s/^/CHG_CTOB_/;
+ − 726 $chg_ctob =~ s/sam$/txt/;
+ − 727 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
+ − 728 $chg_ctob = $output_dir . $chg_ctob;
+ − 729 push @sorting_files,$chg_ctob;
+ − 730 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
+ − 731 print "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
+ − 732
+ − 733 unless($no_header){
+ − 734 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
+ − 735 }
+ − 736
+ − 737 $chg_ob =~ s/^/CHG_OB_/;
+ − 738 $chg_ob =~ s/sam$/txt/;
+ − 739 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
+ − 740 $chg_ob = $output_dir . $chg_ob;
+ − 741 push @sorting_files,$chg_ob;
+ − 742 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
+ − 743 print "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
+ − 744
+ − 745 unless($no_header){
+ − 746 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
+ − 747 }
+ − 748
+ − 749 ### For cytosines in CHH context
+ − 750 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
+ − 751
+ − 752 $chh_ot =~ s/^/CHH_OT_/;
+ − 753 $chh_ot =~ s/sam$/txt/;
+ − 754 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
+ − 755 $chh_ot = $output_dir . $chh_ot;
+ − 756 push @sorting_files,$chh_ot;
+ − 757 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
+ − 758 print "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
+ − 759
+ − 760 unless($no_header){
+ − 761 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
+ − 762 }
+ − 763
+ − 764 $chh_ctot =~ s/^/CHH_CTOT_/;
+ − 765 $chh_ctot =~ s/sam$/txt/;
+ − 766 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
+ − 767 $chh_ctot = $output_dir . $chh_ctot;
+ − 768 push @sorting_files,$chh_ctot;
+ − 769 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
+ − 770 print "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
+ − 771
+ − 772 unless($no_header){
+ − 773 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
+ − 774 }
+ − 775
+ − 776 $chh_ctob =~ s/^/CHH_CTOB_/;
+ − 777 $chh_ctob =~ s/sam$/txt/;
+ − 778 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
+ − 779 $chh_ctob = $output_dir . $chh_ctob;
+ − 780 push @sorting_files,$chh_ctob;
+ − 781 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
+ − 782 print "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
+ − 783
+ − 784 unless($no_header){
+ − 785 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
+ − 786 }
+ − 787
+ − 788 $chh_ob =~ s/^/CHH_OB_/;
+ − 789 $chh_ob =~ s/sam$/txt/;
+ − 790 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
+ − 791 $chh_ob = $output_dir . $chh_ob;
+ − 792 push @sorting_files,$chh_ob;
+ − 793 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
+ − 794 print "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
+ − 795
+ − 796 unless($no_header){
+ − 797 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
+ − 798 }
+ − 799 }
+ − 800
+ − 801 my $methylation_call_strings_processed = 0;
+ − 802 my $line_count = 0;
+ − 803
+ − 804 ### proceeding differently now for single-end or paired-end Bismark files
+ − 805
+ − 806 ### PROCESSING SINGLE-END RESULT FILES
+ − 807 if ($single) {
+ − 808
+ − 809 ### also proceeding differently now for SAM format or vanilla Bismark format files
+ − 810 if ($vanilla) { # old vanilla Bismark output format
+ − 811 while (<IN>) {
+ − 812 ++$line_count;
+ − 813 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 814
+ − 815 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ − 816 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
+ − 817
+ − 818 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
+ − 819 ### last position
+ − 820 chomp $genome_conversion;
+ − 821
+ − 822 my $index;
+ − 823 if ($meth_call) {
+ − 824
+ − 825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ − 826 $index = 0;
+ − 827 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ − 828 $index = 1;
+ − 829 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ − 830 $index = 3;
+ − 831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ − 832 $index = 2;
+ − 833 } else {
+ − 834 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ − 835 }
+ − 836
+ − 837 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
+ − 838 if ($ignore) {
+ − 839 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+ − 840
+ − 841 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ − 842 if ($strand eq '+') {
+ − 843 $start += $ignore;
+ − 844 } elsif ($strand eq '-') {
+ − 845 $start += length($meth_call)-1; ## $meth_call is already shortened!
+ − 846 } else {
+ − 847 die "Alignment did not have proper strand information: $strand\n";
+ − 848 }
+ − 849 }
+ − 850 ### printing out the methylation state of every C in the read
+ − 851 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
+ − 852
+ − 853 ++$methylation_call_strings_processed; # 1 per single-end result
+ − 854 }
+ − 855 }
+ − 856 } else { # processing single-end SAM format (default)
+ − 857 while (<IN>) {
+ − 858 ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ − 859 if (/^\@/) { # skipping header lines (starting with @)
+ − 860 warn "skipping SAM header line:\t$_";
+ − 861 next;
+ − 862 }
+ − 863
+ − 864 ++$line_count;
+ − 865 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 866
+ − 867 # example read in SAM format
+ − 868 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ − 869 ###
+ − 870
+ − 871 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
+ − 872 # < 0.7.6 $meth_call =~ s/^XM:Z://;
+ − 873 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
+ − 874 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
+ − 875
+ − 876 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
+ − 877
+ − 878 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 879 my $meth_call; ### Thanks to Zachary Zeno for this solution
+ − 880 my $read_conversion;
+ − 881 my $genome_conversion;
+ − 882
+ − 883 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ − 884 my $tag = $1;
+ − 885 my $value = $2;
+ − 886
+ − 887 if ($tag eq "XM") {
+ − 888 $meth_call = $value;
+ − 889 $meth_call =~ s/\r//;
+ − 890 } elsif ($tag eq "XR") {
+ − 891 $read_conversion = $value;
+ − 892 $read_conversion =~ s/\r//;
+ − 893 } elsif ($tag eq "XG") {
+ − 894 $genome_conversion = $value;
+ − 895 $genome_conversion =~ s/\r//;
+ − 896 }
+ − 897 }
+ − 898
+ − 899 my $strand;
+ − 900 chomp $genome_conversion;
+ − 901 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
+ − 902
+ − 903 my $index;
+ − 904 if ($meth_call) {
+ − 905 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+ − 906 $index = 0;
+ − 907 $strand = '+';
+ − 908 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+ − 909 $index = 1;
+ − 910 $strand = '-';
+ − 911 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+ − 912 $index = 2;
+ − 913 $strand = '+';
+ − 914 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+ − 915 $index = 3;
+ − 916 $strand = '-';
+ − 917 } else {
+ − 918 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+ − 919 }
+ − 920
+ − 921 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
+ − 922 if ($strand eq '-') {
+ − 923 $meth_call = reverse $meth_call;
+ − 924 }
+ − 925
+ − 926 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
+ − 927 if ($ignore) {
+ − 928 # print "\n\n$meth_call\n";
+ − 929 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
+ − 930 # print "$meth_call\n";
+ − 931 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+ − 932
+ − 933 my @len = split (/\D+/,$cigar); # storing the length per operation
+ − 934 my @ops = split (/\d+/,$cigar); # storing the operation
+ − 935 shift @ops; # remove the empty first element
+ − 936 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+ − 937
+ − 938 my @comp_cigar; # building an array with all CIGAR operations
+ − 939 foreach my $index (0..$#len) {
+ − 940 foreach (1..$len[$index]) {
+ − 941 # print "$ops[$index]";
+ − 942 push @comp_cigar, $ops[$index];
+ − 943 }
+ − 944 }
+ − 945 # print "original CIGAR: $cigar\n";
+ − 946 # print "original CIGAR: @comp_cigar\n";
+ − 947
+ − 948 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
+ − 949 if ($strand eq '+') {
+ − 950
+ − 951 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
+ − 952 my $I_count = 0;
+ − 953
+ − 954 for (1..$ignore) {
+ − 955 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
+ − 956 # print "$_ deleted $op\n";
+ − 957
+ − 958 while ($op eq 'D') { # repeating this for deletions (D)
+ − 959 $D_count++;
+ − 960 $op = shift @comp_cigar;
+ − 961 # print "$_ deleted $op\n";
+ − 962 }
+ − 963 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 964 $I_count++;
+ − 965 }
+ − 966 }
+ − 967 $start += $ignore + $D_count - $I_count;
+ − 968 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
+ − 969 } elsif ($strand eq '-') {
+ − 970
+ − 971 for (1..$ignore) {
+ − 972 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 973 while ($op eq 'D') { # repeating this for deletions (D)
+ − 974 $op = pop @comp_cigar;
+ − 975 }
+ − 976 }
+ − 977
+ − 978 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
+ − 979 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
+ − 980 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
+ − 981 foreach (@comp_cigar) {
+ − 982 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
+ − 983 }
+ − 984 $start += $MD_count - 1;
+ − 985 }
+ − 986
+ − 987 ### reconstituting shortened CIGAR string
+ − 988 my $new_cigar;
+ − 989 my $count = 0;
+ − 990 my $last_op;
+ − 991 # print "ignore adjusted: @comp_cigar\n";
+ − 992 foreach my $op (@comp_cigar) {
+ − 993 unless (defined $last_op){
+ − 994 $last_op = $op;
+ − 995 ++$count;
+ − 996 next;
+ − 997 }
+ − 998 if ($last_op eq $op) {
+ − 999 ++$count;
+ − 1000 } else {
+ − 1001 $new_cigar .= "$count$last_op";
+ − 1002 $last_op = $op;
+ − 1003 $count = 1;
+ − 1004 }
+ − 1005 }
+ − 1006 $new_cigar .= "$count$last_op"; # appending the last operation and count
+ − 1007 $cigar = $new_cigar;
+ − 1008 # print "ignore adjusted scalar: $cigar\n";
+ − 1009 }
+ − 1010 }
+ − 1011 ### printing out the methylation state of every C in the read
+ − 1012 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
+ − 1013
+ − 1014 ++$methylation_call_strings_processed; # 1 per single-end result
+ − 1015 }
+ − 1016 }
+ − 1017 }
+ − 1018
+ − 1019 ### PROCESSING PAIRED-END RESULT FILES
+ − 1020 elsif ($paired) {
+ − 1021
+ − 1022 ### proceeding differently now for SAM format or vanilla Bismark format files
+ − 1023 if ($vanilla) { # old vanilla Bismark paired-end output format
+ − 1024 while (<IN>) {
+ − 1025 ++$line_count;
+ − 1026 warn "processed line: $line_count\n" if ($line_count%500000==0);
+ − 1027
+ − 1028 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
+ − 1029 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
+ − 1030
+ − 1031 my $index;
+ − 1032 chomp $genome_conversion;
+ − 1033
+ − 1034 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ − 1035 $index = 0; ## this is OT
+ − 1036 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ − 1037 $index = 2; ## this is CTOB!!!
+ − 1038 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ − 1039 $index = 1; ## this is CTOT!!!
+ − 1040 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ − 1041 $index = 3; ## this is OB
+ − 1042 } else {
+ − 1043 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ − 1044 }
+ − 1045
+ − 1046 if ($meth_call_1 and $meth_call_2) {
+ − 1047 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
+ − 1048 if ($ignore) {
+ − 1049 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
+ − 1050 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
+ − 1051
+ − 1052 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
+ − 1053 $start_read_1 += $ignore;
+ − 1054 $end_read_2 -= $ignore;
+ − 1055 }
+ − 1056 my $end_read_1;
+ − 1057 my $start_read_2;
+ − 1058
+ − 1059 if ($strand eq '+') {
+ − 1060
+ − 1061 $end_read_1 = $start_read_1+length($meth_call_1)-1;
+ − 1062 $start_read_2 = $end_read_2-length($meth_call_2)+1;
+ − 1063
+ − 1064 ## we first pass the first read which is in + orientation on the forward strand
+ − 1065 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
+ − 1066
+ − 1067 # we next pass the second read which is in - orientation on the reverse strand
+ − 1068 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
+ − 1069 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
+ − 1070 } else {
+ − 1071
+ − 1072 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
+ − 1073 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
+ − 1074
+ − 1075 ## we first pass the first read which is in - orientation on the reverse strand
+ − 1076 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
+ − 1077
+ − 1078 # we next pass the second read which is in + orientation on the forward strand
+ − 1079 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
+ − 1080 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
+ − 1081 }
+ − 1082
+ − 1083 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
+ − 1084 }
+ − 1085 }
+ − 1086 } else { # Bismark paired-end SAM output format (default)
+ − 1087 while (<IN>) {
+ − 1088 ### SAM format can either start with header lines (starting with @) or start with alignments directly
+ − 1089 if (/^\@/) { # skipping header lines (starting with @)
+ − 1090 warn "skipping SAM header line:\t$_";
+ − 1091 next;
+ − 1092 }
+ − 1093
+ − 1094 ++$line_count;
+ − 1095 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
+ − 1096
+ − 1097 # example paired-end reads in SAM format (2 consecutive lines)
+ − 1098 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
+ − 1099 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
+ − 1100
+ − 1101 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
+ − 1102
+ − 1103 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 1104 my $meth_call_1;
+ − 1105 my $first_read_conversion;
+ − 1106 my $genome_conversion;
+ − 1107
+ − 1108 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
+ − 1109 my $tag = $1;
+ − 1110 my $value = $2;
+ − 1111
+ − 1112 if ($tag eq "XM") {
+ − 1113 $meth_call_1 = $value;
+ − 1114 $meth_call_1 =~ s/\r//;
+ − 1115 } elsif ($tag eq "XR") {
+ − 1116 $first_read_conversion = $value;
+ − 1117 $first_read_conversion =~ s/\r//;
+ − 1118 } elsif ($tag eq "XG") {
+ − 1119 $genome_conversion = $value;
+ − 1120 $genome_conversion =~ s/\r//;
+ − 1121 }
+ − 1122 }
+ − 1123
+ − 1124 $_ = <IN>; # reading in the paired read
+ − 1125
+ − 1126 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
+ − 1127 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
+ − 1128 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
+ − 1129 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
+ − 1130 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
+ − 1131
+ − 1132 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
+ − 1133
+ − 1134 my $meth_call_2;
+ − 1135 my $second_read_conversion;
+ − 1136
+ − 1137 while ( /(XM|XR):Z:([^\t]+)/g ) {
+ − 1138 my $tag = $1;
+ − 1139 my $value = $2;
+ − 1140
+ − 1141 if ($tag eq "XM") {
+ − 1142 $meth_call_2 = $value;
+ − 1143 $meth_call_2 =~ s/\r//;
+ − 1144 } elsif ($tag eq "XR") {
+ − 1145 $second_read_conversion = $value;
+ − 1146 $second_read_conversion = s/\r//;
+ − 1147 }
+ − 1148 }
+ − 1149
+ − 1150 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
+ − 1151 chomp $genome_conversion; # in case it captured a new line character
+ − 1152
+ − 1153 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
+ − 1154
+ − 1155 my $index;
+ − 1156 my $strand;
+ − 1157
+ − 1158 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
+ − 1159 $index = 0; ## this is OT
+ − 1160 $strand = '+';
+ − 1161 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
+ − 1162 $index = 1; ## this is CTOT
+ − 1163 $strand = '-';
+ − 1164 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
+ − 1165 $index = 2; ## this is CTOB
+ − 1166 $strand = '+';
+ − 1167 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
+ − 1168 $index = 3; ## this is OB
+ − 1169 $strand = '-';
+ − 1170 } else {
+ − 1171 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
+ − 1172 }
+ − 1173
+ − 1174 ### reversing the methylation call of the read that was reverse-complemented
+ − 1175 if ($strand eq '+') {
+ − 1176 $meth_call_2 = reverse $meth_call_2;
+ − 1177 } else {
+ − 1178 $meth_call_1 = reverse $meth_call_1;
+ − 1179 }
+ − 1180
+ − 1181 if ($meth_call_1 and $meth_call_2) {
+ − 1182
+ − 1183 my $end_read_1;
+ − 1184
+ − 1185 ### READ 1
+ − 1186 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
+ − 1187 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
+ − 1188 shift @ops_1; # remove the empty first element
+ − 1189 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
+ − 1190
+ − 1191 my @comp_cigar_1; # building an array with all CIGAR operations
+ − 1192 foreach my $index (0..$#len_1) {
+ − 1193 foreach (1..$len_1[$index]) {
+ − 1194 # print "$ops_1[$index]";
+ − 1195 push @comp_cigar_1, $ops_1[$index];
+ − 1196 }
+ − 1197 }
+ − 1198 # print "original CIGAR read 1: $cigar_1\n";
+ − 1199 # print "original CIGAR read 1: @comp_cigar_1\n";
+ − 1200
+ − 1201 ### READ 2
+ − 1202 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
+ − 1203 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
+ − 1204 shift @ops_2; # remove the empty first element
+ − 1205 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
+ − 1206 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
+ − 1207 foreach my $index (0..$#len_2) {
+ − 1208 foreach (1..$len_2[$index]) {
+ − 1209 # print "$ops_2[$index]";
+ − 1210 push @comp_cigar_2, $ops_2[$index];
+ − 1211 }
+ − 1212 }
+ − 1213 # print "original CIGAR read 2: $cigar_2\n";
+ − 1214 # print "original CIGAR read 2: @comp_cigar_2\n";
+ − 1215
+ − 1216 if ($ignore) {
+ − 1217 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
+ − 1218 ### the methylation calls have already been reversed where necessary
+ − 1219 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
+ − 1220 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
+ − 1221
+ − 1222 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
+ − 1223
+ − 1224 if ($strand eq '+') {
+ − 1225
+ − 1226 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
+ − 1227 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
+ − 1228 my $I_count_1 = 0;
+ − 1229
+ − 1230 for (1..$ignore) {
+ − 1231 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
+ − 1232 # print "$_ deleted $op\n";
+ − 1233
+ − 1234 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1235 $D_count_1++;
+ − 1236 $op = shift @comp_cigar_1;
+ − 1237 # print "$_ deleted $op\n";
+ − 1238 }
+ − 1239 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 1240 $I_count_1++;
+ − 1241 }
+ − 1242 }
+ − 1243
+ − 1244 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
+ − 1245 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
+ − 1246
+ − 1247 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
+ − 1248
+ − 1249 for (1..$ignore) {
+ − 1250 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 1251 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1252 $op = pop @comp_cigar_2;
+ − 1253 }
+ − 1254 }
+ − 1255 # the start position of reads mapping to the reverse strand is being adjusted further below
+ − 1256 } elsif ($strand eq '-') {
+ − 1257
+ − 1258 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
+ − 1259 for (1..$ignore) {
+ − 1260 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
+ − 1261 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1262 $op = pop @comp_cigar_1;
+ − 1263 }
+ − 1264 }
+ − 1265 # the start position of reads mapping to the reverse strand is being adjusted further below
+ − 1266
+ − 1267 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
+ − 1268 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
+ − 1269 my $I_count_2 = 0;
+ − 1270
+ − 1271 for (1..$ignore) {
+ − 1272 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
+ − 1273 # print "$_ deleted $op\n";
+ − 1274
+ − 1275 while ($op eq 'D') { # repeating this for deletions (D)
+ − 1276 $D_count_2++;
+ − 1277 $op = shift @comp_cigar_2;
+ − 1278 # print "$_ deleted $op\n";
+ − 1279 }
+ − 1280 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
+ − 1281 $I_count_2++;
+ − 1282 }
+ − 1283 }
+ − 1284
+ − 1285 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
+ − 1286 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
+ − 1287
+ − 1288 }
+ − 1289
+ − 1290 ### reconstituting shortened CIGAR string 1
+ − 1291 my $new_cigar_1;
+ − 1292 my $count_1 = 0;
+ − 1293 my $last_op_1;
+ − 1294 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
+ − 1295 foreach my $op (@comp_cigar_1) {
+ − 1296 unless (defined $last_op_1){
+ − 1297 $last_op_1 = $op;
+ − 1298 ++$count_1;
+ − 1299 next;
+ − 1300 }
+ − 1301 if ($last_op_1 eq $op) {
+ − 1302 ++$count_1;
+ − 1303 } else {
+ − 1304 $new_cigar_1 .= "$count_1$last_op_1";
+ − 1305 $last_op_1 = $op;
+ − 1306 $count_1 = 1;
+ − 1307 }
+ − 1308 }
+ − 1309 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
+ − 1310 $cigar_1 = $new_cigar_1;
+ − 1311 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
+ − 1312
+ − 1313 ### reconstituting shortened CIGAR string 2
+ − 1314 my $new_cigar_2;
+ − 1315 my $count_2 = 0;
+ − 1316 my $last_op_2;
+ − 1317 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
+ − 1318 foreach my $op (@comp_cigar_2) {
+ − 1319 unless (defined $last_op_2){
+ − 1320 $last_op_2 = $op;
+ − 1321 ++$count_2;
+ − 1322 next;
+ − 1323 }
+ − 1324 if ($last_op_2 eq $op) {
+ − 1325 ++$count_2;
+ − 1326 } else {
+ − 1327 $new_cigar_2 .= "$count_2$last_op_2";
+ − 1328 $last_op_2 = $op;
+ − 1329 $count_2 = 1;
+ − 1330 }
+ − 1331 }
+ − 1332 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
+ − 1333 $cigar_2 = $new_cigar_2;
+ − 1334 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
+ − 1335
+ − 1336 }
+ − 1337
+ − 1338 if ($strand eq '+') {
+ − 1339 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
+ − 1340 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 1341 # print "reverse: @comp_cigar_2\n";
+ − 1342
+ − 1343 my $MD_count_1 = 0;
+ − 1344 foreach (@comp_cigar_1) {
+ − 1345 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 1346 }
+ − 1347
+ − 1348 my $MD_count_2 = 0;
+ − 1349 foreach (@comp_cigar_2) {
+ − 1350 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 1351 }
+ − 1352
+ − 1353 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
+ − 1354 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
+ − 1355 } else {
+ − 1356 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
+ − 1357
+ − 1358 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 1359 # print "reverse: @comp_cigar_1\n";
+ − 1360
+ − 1361 my $MD_count_1 = 0;
+ − 1362 foreach (@comp_cigar_1) {
+ − 1363 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 1364 }
+ − 1365
+ − 1366 $end_read_1 = $start_read_1;
+ − 1367 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
+ − 1368
+ − 1369 }
+ − 1370
+ − 1371 if ($strand eq '+') {
+ − 1372 ## we first pass the first read which is in + orientation on the forward strand
+ − 1373 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
+ − 1374
+ − 1375 # we next pass the second read which is in - orientation on the reverse strand
+ − 1376 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
+ − 1377 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
+ − 1378 } else {
+ − 1379 ## we first pass the first read which is in - orientation on the reverse strand
+ − 1380 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
+ − 1381
+ − 1382 # we next pass the second read which is in + orientation on the forward strand
+ − 1383 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
+ − 1384 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
+ − 1385 }
+ − 1386
+ − 1387 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
+ − 1388 }
+ − 1389 }
+ − 1390 }
+ − 1391 } else {
+ − 1392 die "Single-end or paired-end reads not specified properly\n";
+ − 1393 }
+ − 1394
+ − 1395 print "\n\nProcessed $line_count lines from $filename in total\n";
+ − 1396 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
+ − 1397 if ($report) {
+ − 1398 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
+ − 1399 }
+ − 1400 print_splitting_report ();
+ − 1401 }
+ − 1402
+ − 1403
+ − 1404
+ − 1405 sub print_splitting_report{
+ − 1406
+ − 1407 ### Calculating methylation percentages if applicable
+ − 1408
+ − 1409 my $percent_meCpG;
+ − 1410 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
+ − 1411 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
+ − 1412 }
+ − 1413
+ − 1414 my $percent_meCHG;
+ − 1415 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ − 1416 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
+ − 1417 }
+ − 1418
+ − 1419 my $percent_meCHH;
+ − 1420 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
+ − 1421 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
+ − 1422 }
+ − 1423
+ − 1424 my $percent_non_CpG_methylation;
+ − 1425 if ($merge_non_CpG){
+ − 1426 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
+ − 1427 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
+ − 1428 }
+ − 1429 }
+ − 1430
+ − 1431 if ($report){
+ − 1432 ### detailed information about Cs analysed
+ − 1433 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
+ − 1434
+ − 1435 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ − 1436 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
+ − 1437
+ − 1438 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ − 1439 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ − 1440 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ − 1441
+ − 1442 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ − 1443 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ − 1444 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+ − 1445
+ − 1446 ### calculating methylated CpG percentage if applicable
+ − 1447 if ($percent_meCpG){
+ − 1448 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
+ − 1449 }
+ − 1450 else{
+ − 1451 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ − 1452 }
+ − 1453
+ − 1454 ### 2-Context Output
+ − 1455 if ($merge_non_CpG){
+ − 1456 if ($percent_non_CpG_methylation){
+ − 1457 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ − 1458 }
+ − 1459 else{
+ − 1460 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ − 1461 }
+ − 1462 }
+ − 1463
+ − 1464 ### 3 Context Output
+ − 1465 else{
+ − 1466 ### calculating methylated CHG percentage if applicable
+ − 1467 if ($percent_meCHG){
+ − 1468 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
+ − 1469 }
+ − 1470 else{
+ − 1471 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ − 1472 }
+ − 1473
+ − 1474 ### calculating methylated CHH percentage if applicable
+ − 1475 if ($percent_meCHH){
+ − 1476 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ − 1477 }
+ − 1478 else{
+ − 1479 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ − 1480 }
+ − 1481 }
+ − 1482 }
+ − 1483
+ − 1484 ### detailed information about Cs analysed for on-screen report
+ − 1485 print "Final Cytosine Methylation Report\n",'='x33,"\n";
+ − 1486
+ − 1487 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
+ − 1488 print "Total number of C's analysed:\t$total_number_of_C\n\n";
+ − 1489
+ − 1490 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
+ − 1491 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
+ − 1492 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
+ − 1493
+ − 1494 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+ − 1495 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+ − 1496 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+ − 1497
+ − 1498 ### printing methylated CpG percentage if applicable
+ − 1499 if ($percent_meCpG){
+ − 1500 print "C methylated in CpG context:\t${percent_meCpG}%\n";
+ − 1501 }
+ − 1502 else{
+ − 1503 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
+ − 1504 }
+ − 1505
+ − 1506 ### 2-Context Output
+ − 1507 if ($merge_non_CpG){
+ − 1508 if ($percent_non_CpG_methylation){
+ − 1509 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
+ − 1510 }
+ − 1511 else{
+ − 1512 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
+ − 1513 }
+ − 1514 }
+ − 1515
+ − 1516 ### 3-Context Output
+ − 1517 else{
+ − 1518 ### printing methylated CHG percentage if applicable
+ − 1519 if ($percent_meCHG){
+ − 1520 print "C methylated in CHG context:\t${percent_meCHG}%\n";
+ − 1521 }
+ − 1522 else{
+ − 1523 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
+ − 1524 }
+ − 1525
+ − 1526 ### printing methylated CHH percentage if applicable
+ − 1527 if ($percent_meCHH){
+ − 1528 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+ − 1529 }
+ − 1530 else{
+ − 1531 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
+ − 1532 }
+ − 1533 }
+ − 1534 }
+ − 1535
+ − 1536
+ − 1537
+ − 1538
+ − 1539
+ − 1540 sub print_individual_C_methylation_states_paired_end_files{
+ − 1541
+ − 1542 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
+ − 1543 my @methylation_calls = split(//,$meth_call);
+ − 1544
+ − 1545 #################################################################
+ − 1546 ### . for bases not involving cytosines ###
+ − 1547 ### X for methylated C in CHG context (was protected) ###
+ − 1548 ### x for not methylated C in CHG context (was converted) ###
+ − 1549 ### H for methylated C in CHH context (was protected) ###
+ − 1550 ### h for not methylated C in CHH context (was converted) ###
+ − 1551 ### Z for methylated C in CpG context (was protected) ###
+ − 1552 ### z for not methylated C in CpG context (was converted) ###
+ − 1553 #################################################################
+ − 1554
+ − 1555 my $methyl_CHG_count = 0;
+ − 1556 my $methyl_CHH_count = 0;
+ − 1557 my $methyl_CpG_count = 0;
+ − 1558 my $unmethylated_CHG_count = 0;
+ − 1559 my $unmethylated_CHH_count = 0;
+ − 1560 my $unmethylated_CpG_count = 0;
+ − 1561
+ − 1562 my @len;
+ − 1563 my @ops;
+ − 1564 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ − 1565 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+ − 1566 my @comp_cigar;
+ − 1567
+ − 1568 if ($cigar){ # parsing CIGAR string
+ − 1569 @len = split (/\D+/,$cigar); # storing the length per operation
+ − 1570 @ops = split (/\d+/,$cigar); # storing the operation
+ − 1571 shift @ops; # remove the empty first element
+ − 1572 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+ − 1573
+ − 1574 foreach my $index (0..$#len){
+ − 1575 foreach (1..$len[$index]){
+ − 1576 # print "$ops[$index]";
+ − 1577 push @comp_cigar, $ops[$index];
+ − 1578 }
+ − 1579 }
+ − 1580 # warn "\nDetected CIGAR string: $cigar\n";
+ − 1581 # warn "Length of methylation call: ",length $meth_call,"\n";
+ − 1582 # warn "number of operations: ",scalar @ops,"\n";
+ − 1583 # warn "number of length digits: ",scalar @len,"\n\n";
+ − 1584 # print @comp_cigar,"\n";
+ − 1585 # print "$meth_call\n\n";
+ − 1586 # sleep (1);
+ − 1587 }
+ − 1588
+ − 1589
+ − 1590 if ($strand eq '-') {
+ − 1591
+ − 1592 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
+ − 1593 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 1594 # print "reverse CIGAR string: @comp_cigar\n";
+ − 1595
+ − 1596 ### the start position of paired-end files has already been corrected, see above
+ − 1597 }
+ − 1598
+ − 1599 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
+ − 1600
+ − 1601 if ($merge_non_CpG) {
+ − 1602
+ − 1603 if ($no_overlap) {
+ − 1604
+ − 1605 ### single-file CpG and non-CpG context output
+ − 1606 if ($full) {
+ − 1607 if ($strand eq '+') {
+ − 1608 for my $index (0..$#methylation_calls) {
+ − 1609
+ − 1610 if ($cigar){ # only needed for SAM files
+ − 1611 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1612 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 1613 $cigar_offset += $cigar_mod;
+ − 1614 $pos_offset += $pos_mod;
+ − 1615 }
+ − 1616
+ − 1617 ### Returning as soon as the methylation calls start overlapping
+ − 1618 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 1619 return;
+ − 1620 }
+ − 1621
+ − 1622 if ($methylation_calls[$index] eq 'X') {
+ − 1623 $counting{total_meCHG_count}++;
+ − 1624 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1625 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1626 $counting{total_unmethylated_CHG_count}++;
+ − 1627 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1628 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1629 $counting{total_meCpG_count}++;
+ − 1630 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1631 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1632 $counting{total_unmethylated_CpG_count}++;
+ − 1633 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1634 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1635 $counting{total_meCHH_count}++;
+ − 1636 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1637 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1638 $counting{total_unmethylated_CHH_count}++;
+ − 1639 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1640 }
+ − 1641 elsif ($methylation_calls[$index] eq '.'){}
+ − 1642 else{
+ − 1643 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1644 }
+ − 1645 }
+ − 1646 } elsif ($strand eq '-') {
+ − 1647 for my $index (0..$#methylation_calls) {
+ − 1648
+ − 1649 if ($cigar){ # only needed for SAM files
+ − 1650 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 1651 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1652 $cigar_offset += $cigar_mod;
+ − 1653 $pos_offset += $pos_mod;
+ − 1654 }
+ − 1655
+ − 1656 ### Returning as soon as the methylation calls start overlapping
+ − 1657 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 1658 return;
+ − 1659 }
+ − 1660
+ − 1661 if ($methylation_calls[$index] eq 'X') {
+ − 1662 $counting{total_meCHG_count}++;
+ − 1663 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1664 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1665 $counting{total_unmethylated_CHG_count}++;
+ − 1666 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1667 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1668 $counting{total_meCpG_count}++;
+ − 1669 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1670 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1671 $counting{total_unmethylated_CpG_count}++;
+ − 1672 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1673 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1674 $counting{total_meCHH_count}++;
+ − 1675 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1676 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1677 $counting{total_unmethylated_CHH_count}++;
+ − 1678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1679 }
+ − 1680 elsif ($methylation_calls[$index] eq '.') {}
+ − 1681 else{
+ − 1682 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1683 }
+ − 1684 }
+ − 1685 } else {
+ − 1686 die "The read orientation was neither + nor -: '$strand'\n";
+ − 1687 }
+ − 1688 }
+ − 1689
+ − 1690 ### strand-specific methylation output
+ − 1691 else {
+ − 1692 if ($strand eq '+') {
+ − 1693 for my $index (0..$#methylation_calls) {
+ − 1694
+ − 1695 if ($cigar){ # only needed for SAM files
+ − 1696 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1697 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 1698 $cigar_offset += $cigar_mod;
+ − 1699 $pos_offset += $pos_mod;
+ − 1700 }
+ − 1701
+ − 1702 ### Returning as soon as the methylation calls start overlapping
+ − 1703 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 1704 return;
+ − 1705 }
+ − 1706
+ − 1707 if ($methylation_calls[$index] eq 'X') {
+ − 1708 $counting{total_meCHG_count}++;
+ − 1709 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1710 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1711 $counting{total_unmethylated_CHG_count}++;
+ − 1712 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1713 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1714 $counting{total_meCpG_count}++;
+ − 1715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1716 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1717 $counting{total_unmethylated_CpG_count}++;
+ − 1718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1719 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1720 $counting{total_meCHH_count}++;
+ − 1721 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1722 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1723 $counting{total_unmethylated_CHH_count}++;
+ − 1724 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1725 }
+ − 1726 elsif ($methylation_calls[$index] eq '.') {}
+ − 1727 else{
+ − 1728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1729 }
+ − 1730 }
+ − 1731 } elsif ($strand eq '-') {
+ − 1732 for my $index (0..$#methylation_calls) {
+ − 1733
+ − 1734 if ($cigar){ # only needed for SAM files
+ − 1735 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 1736 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1737 $cigar_offset += $cigar_mod;
+ − 1738 $pos_offset += $pos_mod;
+ − 1739 }
+ − 1740
+ − 1741 ### Returning as soon as the methylation calls start overlapping
+ − 1742 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 1743 return;
+ − 1744 }
+ − 1745
+ − 1746 if ($methylation_calls[$index] eq 'X') {
+ − 1747 $counting{total_meCHG_count}++;
+ − 1748 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1749 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1750 $counting{total_unmethylated_CHG_count}++;
+ − 1751 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1752 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1753 $counting{total_meCpG_count}++;
+ − 1754 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1755 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1756 $counting{total_unmethylated_CpG_count}++;
+ − 1757 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1758 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1759 $counting{total_meCHH_count}++;
+ − 1760 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1761 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1762 $counting{total_unmethylated_CHH_count}++;
+ − 1763 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1764 }
+ − 1765 elsif ($methylation_calls[$index] eq '.') {}
+ − 1766 else{
+ − 1767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1768 }
+ − 1769 }
+ − 1770 } else {
+ − 1771 die "The strand orientation was neither + nor -: '$strand'/n";
+ − 1772 }
+ − 1773 }
+ − 1774 }
+ − 1775
+ − 1776 ### this is the default paired-end procedure allowing overlaps and using every single C position
+ − 1777 ### Still within the 2-CONTEXT ONLY optional section
+ − 1778 else {
+ − 1779 ### single-file CpG and non-CpG context output
+ − 1780 if ($full) {
+ − 1781 if ($strand eq '+') {
+ − 1782 for my $index (0..$#methylation_calls) {
+ − 1783
+ − 1784 if ($cigar){ # only needed for SAM files
+ − 1785 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1786 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 1787 $cigar_offset += $cigar_mod;
+ − 1788 $pos_offset += $pos_mod;
+ − 1789 }
+ − 1790
+ − 1791 if ($methylation_calls[$index] eq 'X') {
+ − 1792 $counting{total_meCHG_count}++;
+ − 1793 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1794 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1795 $counting{total_unmethylated_CHG_count}++;
+ − 1796 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1797 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1798 $counting{total_meCpG_count}++;
+ − 1799 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1800 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1801 $counting{total_unmethylated_CpG_count}++;
+ − 1802 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1803 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1804 $counting{total_meCHH_count}++;
+ − 1805 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1806 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1807 $counting{total_unmethylated_CHH_count}++;
+ − 1808 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1809 }
+ − 1810 elsif ($methylation_calls[$index] eq '.') {}
+ − 1811 else{
+ − 1812 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1813 }
+ − 1814 }
+ − 1815 } elsif ($strand eq '-') {
+ − 1816 for my $index (0..$#methylation_calls) {
+ − 1817
+ − 1818 if ($cigar){ # only needed for SAM files
+ − 1819 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 1820 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1821 $cigar_offset += $cigar_mod;
+ − 1822 $pos_offset += $pos_mod;
+ − 1823 }
+ − 1824
+ − 1825 if ($methylation_calls[$index] eq 'X') {
+ − 1826 $counting{total_meCHG_count}++;
+ − 1827 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1828 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1829 $counting{total_unmethylated_CHG_count}++;
+ − 1830 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1831 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1832 $counting{total_meCpG_count}++;
+ − 1833 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1834 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1835 $counting{total_unmethylated_CpG_count}++;
+ − 1836 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1837 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1838 $counting{total_meCHH_count}++;
+ − 1839 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1840 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1841 $counting{total_unmethylated_CHH_count}++;
+ − 1842 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1843 }
+ − 1844 elsif ($methylation_calls[$index] eq '.') {}
+ − 1845 else{
+ − 1846 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1847 }
+ − 1848 }
+ − 1849 } else {
+ − 1850 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 1851 }
+ − 1852 }
+ − 1853
+ − 1854 ### strand-specific methylation output
+ − 1855 ### still within the 2-CONTEXT optional section
+ − 1856 else {
+ − 1857 if ($strand eq '+') {
+ − 1858 for my $index (0..$#methylation_calls) {
+ − 1859
+ − 1860 if ($cigar){ # only needed for SAM files
+ − 1861 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 1863 $cigar_offset += $cigar_mod;
+ − 1864 $pos_offset += $pos_mod;
+ − 1865 }
+ − 1866
+ − 1867 if ($methylation_calls[$index] eq 'X') {
+ − 1868 $counting{total_meCHG_count}++;
+ − 1869 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1870 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1871 $counting{total_unmethylated_CHG_count}++;
+ − 1872 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1873 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1874 $counting{total_meCpG_count}++;
+ − 1875 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1876 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1877 $counting{total_unmethylated_CpG_count}++;
+ − 1878 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1879 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1880 $counting{total_meCHH_count}++;
+ − 1881 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1882 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1883 $counting{total_unmethylated_CHH_count}++;
+ − 1884 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1885 }
+ − 1886 elsif ($methylation_calls[$index] eq '.') {}
+ − 1887 else{
+ − 1888 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1889 }
+ − 1890 }
+ − 1891 } elsif ($strand eq '-') {
+ − 1892 for my $index (0..$#methylation_calls) {
+ − 1893
+ − 1894 if ($cigar){ # only needed for SAM files
+ − 1895 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 1896 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1897 $cigar_offset += $cigar_mod;
+ − 1898 $pos_offset += $pos_mod;
+ − 1899 }
+ − 1900
+ − 1901 if ($methylation_calls[$index] eq 'X') {
+ − 1902 $counting{total_meCHG_count}++;
+ − 1903 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1904 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1905 $counting{total_unmethylated_CHG_count}++;
+ − 1906 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1907 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1908 $counting{total_meCpG_count}++;
+ − 1909 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1910 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1911 $counting{total_unmethylated_CpG_count}++;
+ − 1912 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1913 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1914 $counting{total_meCHH_count}++;
+ − 1915 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1916 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1917 $counting{total_unmethylated_CHH_count}++;
+ − 1918 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1919 }
+ − 1920 elsif ($methylation_calls[$index] eq '.') {}
+ − 1921 else{
+ − 1922 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1923 }
+ − 1924 }
+ − 1925 } else {
+ − 1926 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 1927 }
+ − 1928 }
+ − 1929 }
+ − 1930 }
+ − 1931
+ − 1932 ############################################
+ − 1933 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
+ − 1934 ############################################
+ − 1935
+ − 1936 elsif ($no_overlap) {
+ − 1937 ### single-file CpG, CHG and CHH context output
+ − 1938 if ($full) {
+ − 1939 if ($strand eq '+') {
+ − 1940 for my $index (0..$#methylation_calls) {
+ − 1941
+ − 1942 if ($cigar){ # only needed for SAM files
+ − 1943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 1945 $cigar_offset += $cigar_mod;
+ − 1946 $pos_offset += $pos_mod;
+ − 1947 }
+ − 1948
+ − 1949 ### Returning as soon as the methylation calls start overlapping
+ − 1950 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 1951 return;
+ − 1952 }
+ − 1953
+ − 1954 if ($methylation_calls[$index] eq 'X') {
+ − 1955 $counting{total_meCHG_count}++;
+ − 1956 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1957 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1958 $counting{total_unmethylated_CHG_count}++;
+ − 1959 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1960 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 1961 $counting{total_meCpG_count}++;
+ − 1962 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1963 } elsif ($methylation_calls[$index] eq 'z') {
+ − 1964 $counting{total_unmethylated_CpG_count}++;
+ − 1965 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1966 } elsif ($methylation_calls[$index] eq 'H') {
+ − 1967 $counting{total_meCHH_count}++;
+ − 1968 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1969 } elsif ($methylation_calls[$index] eq 'h') {
+ − 1970 $counting{total_unmethylated_CHH_count}++;
+ − 1971 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1972 }
+ − 1973 elsif ($methylation_calls[$index] eq '.') {}
+ − 1974 else{
+ − 1975 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 1976 }
+ − 1977 }
+ − 1978 } elsif ($strand eq '-') {
+ − 1979 for my $index (0..$#methylation_calls) {
+ − 1980
+ − 1981 if ($cigar){ # only needed for SAM files
+ − 1982 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 1983 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 1984 $cigar_offset += $cigar_mod;
+ − 1985 $pos_offset += $pos_mod;
+ − 1986 }
+ − 1987
+ − 1988 ### Returning as soon as the methylation calls start overlapping
+ − 1989 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 1990 return;
+ − 1991 }
+ − 1992
+ − 1993 if ($methylation_calls[$index] eq 'X') {
+ − 1994 $counting{total_meCHG_count}++;
+ − 1995 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1996 } elsif ($methylation_calls[$index] eq 'x') {
+ − 1997 $counting{total_unmethylated_CHG_count}++;
+ − 1998 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 1999 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2000 $counting{total_meCpG_count}++;
+ − 2001 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2002 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2003 $counting{total_unmethylated_CpG_count}++;
+ − 2004 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2005 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2006 $counting{total_meCHH_count}++;
+ − 2007 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2008 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2009 $counting{total_unmethylated_CHH_count}++;
+ − 2010 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2011 }
+ − 2012 elsif ($methylation_calls[$index] eq '.') {}
+ − 2013 else{
+ − 2014 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2015 }
+ − 2016 }
+ − 2017 } else {
+ − 2018 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 2019 }
+ − 2020 }
+ − 2021
+ − 2022 ### strand-specific methylation output
+ − 2023 else {
+ − 2024 if ($strand eq '+') {
+ − 2025 for my $index (0..$#methylation_calls) {
+ − 2026
+ − 2027 if ($cigar){ # only needed for SAM files
+ − 2028 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2030 $cigar_offset += $cigar_mod;
+ − 2031 $pos_offset += $pos_mod;
+ − 2032 }
+ − 2033
+ − 2034 ### Returning as soon as the methylation calls start overlapping
+ − 2035 if ($start+$index+$pos_offset >= $end_read_1) {
+ − 2036 return;
+ − 2037 }
+ − 2038
+ − 2039 if ($methylation_calls[$index] eq 'X') {
+ − 2040 $counting{total_meCHG_count}++;
+ − 2041 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2042 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2043 $counting{total_unmethylated_CHG_count}++;
+ − 2044 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2045 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2046 $counting{total_meCpG_count}++;
+ − 2047 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2048 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2049 $counting{total_unmethylated_CpG_count}++;
+ − 2050 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2051 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2052 $counting{total_meCHH_count}++;
+ − 2053 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2054 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2055 $counting{total_unmethylated_CHH_count}++;
+ − 2056 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2057 }
+ − 2058 elsif ($methylation_calls[$index] eq '.') {}
+ − 2059 else{
+ − 2060 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2061 }
+ − 2062 }
+ − 2063 } elsif ($strand eq '-') {
+ − 2064 for my $index (0..$#methylation_calls) {
+ − 2065
+ − 2066 if ($cigar){ # only needed for SAM files
+ − 2067 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2068 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2069 $cigar_offset += $cigar_mod;
+ − 2070 $pos_offset += $pos_mod;
+ − 2071 }
+ − 2072
+ − 2073 ### Returning as soon as the methylation calls start overlapping
+ − 2074 if ($start-$index+$pos_offset <= $end_read_1) {
+ − 2075 return;
+ − 2076 }
+ − 2077
+ − 2078 if ($methylation_calls[$index] eq 'X') {
+ − 2079 $counting{total_meCHG_count}++;
+ − 2080 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2081 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2082 $counting{total_unmethylated_CHG_count}++;
+ − 2083 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2084 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2085 $counting{total_meCpG_count}++;
+ − 2086 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2087 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2088 $counting{total_unmethylated_CpG_count}++;
+ − 2089 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2090 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2091 $counting{total_meCHH_count}++;
+ − 2092 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2093 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2094 $counting{total_unmethylated_CHH_count}++;
+ − 2095 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2096 }
+ − 2097 elsif ($methylation_calls[$index] eq '.') {}
+ − 2098 else{
+ − 2099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2100 }
+ − 2101 }
+ − 2102 } else {
+ − 2103 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 2104 }
+ − 2105 }
+ − 2106 }
+ − 2107
+ − 2108 ### this is the default paired-end procedure allowing overlaps and using every single C position
+ − 2109 else {
+ − 2110 ### single-file CpG, CHG and CHH context output
+ − 2111 if ($full) {
+ − 2112 if ($strand eq '+') {
+ − 2113 for my $index (0..$#methylation_calls) {
+ − 2114
+ − 2115 if ($cigar){ # only needed for SAM files
+ − 2116 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2117 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2118 $cigar_offset += $cigar_mod;
+ − 2119 $pos_offset += $pos_mod;
+ − 2120 }
+ − 2121
+ − 2122 if ($methylation_calls[$index] eq 'X') {
+ − 2123 $counting{total_meCHG_count}++;
+ − 2124 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2125 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2126 $counting{total_unmethylated_CHG_count}++;
+ − 2127 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2128 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2129 $counting{total_meCpG_count}++;
+ − 2130 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2131 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2132 $counting{total_unmethylated_CpG_count}++;
+ − 2133 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2134 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2135 $counting{total_meCHH_count}++;
+ − 2136 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2137 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2138 $counting{total_unmethylated_CHH_count}++;
+ − 2139 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2140 }
+ − 2141 elsif ($methylation_calls[$index] eq '.') {}
+ − 2142 else{
+ − 2143 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2144 }
+ − 2145 }
+ − 2146 } elsif ($strand eq '-') {
+ − 2147 for my $index (0..$#methylation_calls) {
+ − 2148
+ − 2149 if ($cigar){ # only needed for SAM files
+ − 2150 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2151 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2152 $cigar_offset += $cigar_mod;
+ − 2153 $pos_offset += $pos_mod;
+ − 2154 }
+ − 2155
+ − 2156 if ($methylation_calls[$index] eq 'X') {
+ − 2157 $counting{total_meCHG_count}++;
+ − 2158 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2159 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2160 $counting{total_unmethylated_CHG_count}++;
+ − 2161 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2162 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2163 $counting{total_meCpG_count}++;
+ − 2164 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2165 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2166 $counting{total_unmethylated_CpG_count}++;
+ − 2167 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2168 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2169 $counting{total_meCHH_count}++;
+ − 2170 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2171 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2172 $counting{total_unmethylated_CHH_count}++;
+ − 2173 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2174 }
+ − 2175 elsif ($methylation_calls[$index] eq '.') {}
+ − 2176 else{
+ − 2177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2178 }
+ − 2179 }
+ − 2180 } else {
+ − 2181 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 2182 }
+ − 2183 }
+ − 2184
+ − 2185 ### strand-specific methylation output
+ − 2186 else {
+ − 2187 if ($strand eq '+') {
+ − 2188 for my $index (0..$#methylation_calls) {
+ − 2189
+ − 2190 if ($cigar){ # only needed for SAM files
+ − 2191 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2192 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2193 $cigar_offset += $cigar_mod;
+ − 2194 $pos_offset += $pos_mod;
+ − 2195 }
+ − 2196
+ − 2197 if ($methylation_calls[$index] eq 'X') {
+ − 2198 $counting{total_meCHG_count}++;
+ − 2199 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2200 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2201 $counting{total_unmethylated_CHG_count}++;
+ − 2202 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2203 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2204 $counting{total_meCpG_count}++;
+ − 2205 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2206 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2207 $counting{total_unmethylated_CpG_count}++;
+ − 2208 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2209 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2210 $counting{total_meCHH_count}++;
+ − 2211 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2212 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2213 $counting{total_unmethylated_CHH_count}++;
+ − 2214 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2215 }
+ − 2216 elsif ($methylation_calls[$index] eq '.') {}
+ − 2217 else{
+ − 2218 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2219 }
+ − 2220 }
+ − 2221 } elsif ($strand eq '-') {
+ − 2222 for my $index (0..$#methylation_calls) {
+ − 2223
+ − 2224 if ($cigar){ # only needed for SAM files
+ − 2225 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2226 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2227 $cigar_offset += $cigar_mod;
+ − 2228 $pos_offset += $pos_mod;
+ − 2229 }
+ − 2230
+ − 2231 if ($methylation_calls[$index] eq 'X') {
+ − 2232 $counting{total_meCHG_count}++;
+ − 2233 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2234 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2235 $counting{total_unmethylated_CHG_count}++;
+ − 2236 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2237 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2238 $counting{total_meCpG_count}++;
+ − 2239 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2240 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2241 $counting{total_unmethylated_CpG_count}++;
+ − 2242 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2243 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2244 $counting{total_meCHH_count}++;
+ − 2245 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2246 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2247 $counting{total_unmethylated_CHH_count}++;
+ − 2248 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2249 }
+ − 2250 elsif ($methylation_calls[$index] eq '.') {}
+ − 2251 else{
+ − 2252 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2253 }
+ − 2254 }
+ − 2255 } else {
+ − 2256 die "The strand orientation as neither + nor -: '$strand'\n";
+ − 2257 }
+ − 2258 }
+ − 2259 }
+ − 2260 }
+ − 2261
+ − 2262 sub check_cigar_string {
+ − 2263 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
+ − 2264 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
+ − 2265 my ($new_cigar_offset,$new_pos_offset) = (0,0);
+ − 2266
+ − 2267 if ($strand eq '+') {
+ − 2268 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+ − 2269
+ − 2270 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 2271 # warn "position needs no adjustment\n";
+ − 2272 }
+ − 2273
+ − 2274 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ − 2275 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ − 2276 # warn "adjusted genomic position by -1 bp (insertion)\n";
+ − 2277 }
+ − 2278
+ − 2279 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 2280 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 2281 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ − 2282 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+ − 2283
+ − 2284 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ − 2285 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 2286 # warn "position needs no adjustment\n";
+ − 2287 last;
+ − 2288 }
+ − 2289 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ − 2290 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
+ − 2291 # warn "adjusted genomic position by another -1 bp (insertion)\n";
+ − 2292 last;
+ − 2293 }
+ − 2294 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 2295 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 2296 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
+ − 2297 # warn "adjusted genomic position by another +1 bp (deletion)\n";
+ − 2298 }
+ − 2299 else{
+ − 2300 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 2301 }
+ − 2302 }
+ − 2303 }
+ − 2304 else{
+ − 2305 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 2306 }
+ − 2307 }
+ − 2308
+ − 2309 elsif ($strand eq '-') {
+ − 2310 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
+ − 2311
+ − 2312 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 2313 # warn "position needs no adjustment\n";
+ − 2314 }
+ − 2315
+ − 2316 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
+ − 2317 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
+ − 2318 # warn "adjusted genomic position by +1 bp (insertion)\n";
+ − 2319 }
+ − 2320
+ − 2321 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 2322 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 2323 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ − 2324 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
+ − 2325
+ − 2326 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
+ − 2327 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
+ − 2328 # warn "Found new 'M' operation; position needs no adjustment\n";
+ − 2329 last;
+ − 2330 }
+ − 2331 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
+ − 2332 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
+ − 2333 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
+ − 2334 last;
+ − 2335 }
+ − 2336 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
+ − 2337 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
+ − 2338 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
+ − 2339 # warn "adjusted genomic position by another -1 bp (deletion)\n";
+ − 2340 }
+ − 2341 else{
+ − 2342 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 2343 }
+ − 2344 }
+ − 2345 }
+ − 2346 else{
+ − 2347 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
+ − 2348 }
+ − 2349 }
+ − 2350 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
+ − 2351 return ($new_cigar_offset,$new_pos_offset);
+ − 2352 }
+ − 2353
+ − 2354 sub print_individual_C_methylation_states_single_end{
+ − 2355
+ − 2356 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
+ − 2357 my @methylation_calls = split(//,$meth_call);
+ − 2358
+ − 2359 #################################################################
+ − 2360 ### . for bases not involving cytosines ###
+ − 2361 ### X for methylated C in CHG context (was protected) ###
+ − 2362 ### x for not methylated C in CHG context (was converted) ###
+ − 2363 ### H for methylated C in CHH context (was protected) ###
+ − 2364 ### h for not methylated C in CHH context (was converted) ###
+ − 2365 ### Z for methylated C in CpG context (was protected) ###
+ − 2366 ### z for not methylated C in CpG context (was converted) ###
+ − 2367 #################################################################
+ − 2368
+ − 2369 my $methyl_CHG_count = 0;
+ − 2370 my $methyl_CHH_count = 0;
+ − 2371 my $methyl_CpG_count = 0;
+ − 2372 my $unmethylated_CHG_count = 0;
+ − 2373 my $unmethylated_CHH_count = 0;
+ − 2374 my $unmethylated_CpG_count = 0;
+ − 2375
+ − 2376 my @len;
+ − 2377 my @ops;
+ − 2378 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
+ − 2379 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
+ − 2380
+ − 2381 my @comp_cigar;
+ − 2382
+ − 2383 if ($cigar){ # parsing CIGAR string
+ − 2384 @len = split (/\D+/,$cigar); # storing the length per operation
+ − 2385 @ops = split (/\d+/,$cigar); # storing the operation
+ − 2386 shift @ops; # remove the empty first element
+ − 2387 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+ − 2388
+ − 2389 foreach my $index (0..$#len){
+ − 2390 foreach (1..$len[$index]){
+ − 2391 # print "$ops[$index]";
+ − 2392 push @comp_cigar, $ops[$index];
+ − 2393 }
+ − 2394 }
+ − 2395 # warn "\nDetected CIGAR string: $cigar\n";
+ − 2396 # warn "Length of methylation call: ",length $meth_call,"\n";
+ − 2397 # warn "number of operations: ",scalar @ops,"\n";
+ − 2398 # warn "number of length digits: ",scalar @len,"\n\n";
+ − 2399 # print @comp_cigar,"\n";
+ − 2400 # print "$meth_call\n\n";
+ − 2401 # sleep (1);
+ − 2402 }
+ − 2403
+ − 2404 ### adjusting the start position for all reads mapping to the reverse strand
+ − 2405 if ($strand eq '-') {
+ − 2406
+ − 2407 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
+ − 2408 # print @comp_cigar,"\n";
+ − 2409
+ − 2410 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
+ − 2411
+ − 2412 if ($cigar){ ### SAM format
+ − 2413 my $MD_count = 0;
+ − 2414 foreach (@comp_cigar){
+ − 2415 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
+ − 2416 }
+ − 2417 $start += $MD_count - 1;
+ − 2418 }
+ − 2419 else{ ### vanilla format
+ − 2420 $start += length($meth_call)-1;
+ − 2421 }
+ − 2422 }
+ − 2423 }
+ − 2424
+ − 2425 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
+ − 2426
+ − 2427 ### single-file CpG and other-context output
+ − 2428 if ($full and $merge_non_CpG) {
+ − 2429 if ($strand eq '+') {
+ − 2430 for my $index (0..$#methylation_calls) {
+ − 2431
+ − 2432 if ($cigar){ # only needed for SAM files
+ − 2433 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2434 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
+ − 2435 $cigar_offset += $cigar_mod;
+ − 2436 $pos_offset += $pos_mod;
+ − 2437 }
+ − 2438
+ − 2439 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2440 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2441 if ($methylation_calls[$index] eq 'X') {
+ − 2442 $counting{total_meCHG_count}++;
+ − 2443 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2444 }
+ − 2445 elsif ($methylation_calls[$index] eq 'x') {
+ − 2446 $counting{total_unmethylated_CHG_count}++;
+ − 2447 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2448 }
+ − 2449 elsif ($methylation_calls[$index] eq 'Z') {
+ − 2450 $counting{total_meCpG_count}++;
+ − 2451 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2452 }
+ − 2453 elsif ($methylation_calls[$index] eq 'z') {
+ − 2454 $counting{total_unmethylated_CpG_count}++;
+ − 2455 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2456 }
+ − 2457 elsif ($methylation_calls[$index] eq 'H') {
+ − 2458 $counting{total_meCHH_count}++;
+ − 2459 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2460 }
+ − 2461 elsif ($methylation_calls[$index] eq 'h') {
+ − 2462 $counting{total_unmethylated_CHH_count}++;
+ − 2463 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2464 }
+ − 2465 elsif ($methylation_calls[$index] eq '.') {
+ − 2466 }
+ − 2467 else{
+ − 2468 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2469 }
+ − 2470 }
+ − 2471 }
+ − 2472 elsif ($strand eq '-') {
+ − 2473
+ − 2474 for my $index (0..$#methylation_calls) {
+ − 2475 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2476 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2477
+ − 2478 if ($cigar){ # only needed for SAM files
+ − 2479 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
+ − 2480 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2481 $cigar_offset += $cigar_mod;
+ − 2482 $pos_offset += $pos_mod;
+ − 2483 }
+ − 2484
+ − 2485 if ($methylation_calls[$index] eq 'X') {
+ − 2486 $counting{total_meCHG_count}++;
+ − 2487 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2488 }
+ − 2489 elsif ($methylation_calls[$index] eq 'x') {
+ − 2490 $counting{total_unmethylated_CHG_count}++;
+ − 2491 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2492 }
+ − 2493 elsif ($methylation_calls[$index] eq 'Z') {
+ − 2494 $counting{total_meCpG_count}++;
+ − 2495 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2496 }
+ − 2497 elsif ($methylation_calls[$index] eq 'z') {
+ − 2498 $counting{total_unmethylated_CpG_count}++;
+ − 2499 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2500 }
+ − 2501 elsif ($methylation_calls[$index] eq 'H') {
+ − 2502 $counting{total_meCHH_count}++;
+ − 2503 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2504 }
+ − 2505 elsif ($methylation_calls[$index] eq 'h') {
+ − 2506 $counting{total_unmethylated_CHH_count}++;
+ − 2507 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2508 }
+ − 2509 elsif ($methylation_calls[$index] eq '.'){
+ − 2510 }
+ − 2511 else{
+ − 2512 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2513 }
+ − 2514 }
+ − 2515 }
+ − 2516 else {
+ − 2517 die "The strand information was neither + nor -: $strand\n";
+ − 2518 }
+ − 2519 }
+ − 2520
+ − 2521 ### strand-specific methylation output
+ − 2522 elsif ($merge_non_CpG) {
+ − 2523 if ($strand eq '+') {
+ − 2524 for my $index (0..$#methylation_calls) {
+ − 2525 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2526 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2527
+ − 2528 if ($cigar){ # only needed for SAM files
+ − 2529 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2530 $cigar_offset += $cigar_mod;
+ − 2531 $pos_offset += $pos_mod;
+ − 2532 }
+ − 2533
+ − 2534 if ($methylation_calls[$index] eq 'X') {
+ − 2535 $counting{total_meCHG_count}++;
+ − 2536 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2537 }
+ − 2538 elsif ($methylation_calls[$index] eq 'x') {
+ − 2539 $counting{total_unmethylated_CHG_count}++;
+ − 2540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2541 }
+ − 2542 elsif ($methylation_calls[$index] eq 'Z') {
+ − 2543 $counting{total_meCpG_count}++;
+ − 2544 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2545 }
+ − 2546 elsif ($methylation_calls[$index] eq 'z') {
+ − 2547 $counting{total_unmethylated_CpG_count}++;
+ − 2548 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2549 }
+ − 2550 elsif ($methylation_calls[$index] eq 'H') {
+ − 2551 $counting{total_meCHH_count}++;
+ − 2552 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2553 }
+ − 2554 elsif ($methylation_calls[$index] eq 'h') {
+ − 2555 $counting{total_unmethylated_CHH_count}++;
+ − 2556 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2557 }
+ − 2558 elsif ($methylation_calls[$index] eq '.') {
+ − 2559 }
+ − 2560 else{
+ − 2561 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2562 }
+ − 2563 }
+ − 2564 }
+ − 2565 elsif ($strand eq '-') {
+ − 2566
+ − 2567 for my $index (0..$#methylation_calls) {
+ − 2568 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2569 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2570
+ − 2571 if ($cigar){ # only needed for SAM files
+ − 2572 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2573 $cigar_offset += $cigar_mod;
+ − 2574 $pos_offset += $pos_mod;
+ − 2575 }
+ − 2576
+ − 2577 if ($methylation_calls[$index] eq 'X') {
+ − 2578 $counting{total_meCHG_count}++;
+ − 2579 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2580 }
+ − 2581 elsif ($methylation_calls[$index] eq 'x') {
+ − 2582 $counting{total_unmethylated_CHG_count}++;
+ − 2583 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2584 }
+ − 2585 elsif ($methylation_calls[$index] eq 'Z') {
+ − 2586 $counting{total_meCpG_count}++;
+ − 2587 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2588 }
+ − 2589 elsif ($methylation_calls[$index] eq 'z') {
+ − 2590 $counting{total_unmethylated_CpG_count}++;
+ − 2591 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2592 }
+ − 2593 elsif ($methylation_calls[$index] eq 'H') {
+ − 2594 $counting{total_meCHH_count}++;
+ − 2595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2596 }
+ − 2597 elsif ($methylation_calls[$index] eq 'h') {
+ − 2598 $counting{total_unmethylated_CHH_count}++;
+ − 2599 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2600 }
+ − 2601 elsif ($methylation_calls[$index] eq '.') {
+ − 2602 }
+ − 2603 else{
+ − 2604 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2605 }
+ − 2606 }
+ − 2607 }
+ − 2608 else {
+ − 2609 die "The strand information was neither + nor -: $strand\n";
+ − 2610 }
+ − 2611 }
+ − 2612
+ − 2613 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
+ − 2614
+ − 2615 elsif ($full) {
+ − 2616 if ($strand eq '+') {
+ − 2617 for my $index (0..$#methylation_calls) {
+ − 2618 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2619 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2620
+ − 2621 if ($cigar){ # only needed for SAM files
+ − 2622 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2623 $cigar_offset += $cigar_mod;
+ − 2624 $pos_offset += $pos_mod;
+ − 2625 }
+ − 2626
+ − 2627 if ($methylation_calls[$index] eq 'X') {
+ − 2628 $counting{total_meCHG_count}++;
+ − 2629 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2630 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2631 $counting{total_unmethylated_CHG_count}++;
+ − 2632 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2633 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2634 $counting{total_meCpG_count}++;
+ − 2635 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2636 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2637 $counting{total_unmethylated_CpG_count}++;
+ − 2638 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2639 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2640 $counting{total_meCHH_count}++;
+ − 2641 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2642 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2643 $counting{total_unmethylated_CHH_count}++;
+ − 2644 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2645 }
+ − 2646 elsif ($methylation_calls[$index] eq '.') {}
+ − 2647 else{
+ − 2648 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2649 }
+ − 2650 }
+ − 2651 }
+ − 2652 elsif ($strand eq '-') {
+ − 2653
+ − 2654 for my $index (0..$#methylation_calls) {
+ − 2655 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2656 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2657
+ − 2658 if ($cigar){ # only needed for SAM files
+ − 2659 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2660 $cigar_offset += $cigar_mod;
+ − 2661 $pos_offset += $pos_mod;
+ − 2662 }
+ − 2663
+ − 2664 if ($methylation_calls[$index] eq 'X') {
+ − 2665 $counting{total_meCHG_count}++;
+ − 2666 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2667 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2668 $counting{total_unmethylated_CHG_count}++;
+ − 2669 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2670 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2671 $counting{total_meCpG_count}++;
+ − 2672 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2673 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2674 $counting{total_unmethylated_CpG_count}++;
+ − 2675 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2676 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2677 $counting{total_meCHH_count}++;
+ − 2678 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2679 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2680 $counting{total_unmethylated_CHH_count}++;
+ − 2681 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2682 }
+ − 2683 elsif ($methylation_calls[$index] eq '.') {}
+ − 2684 else{
+ − 2685 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2686 }
+ − 2687 }
+ − 2688 }
+ − 2689 else {
+ − 2690 die "The read had a strand orientation which was neither + nor -: $strand\n";
+ − 2691 }
+ − 2692 }
+ − 2693
+ − 2694 ### strand-specific methylation output
+ − 2695 else {
+ − 2696 if ($strand eq '+') {
+ − 2697 for my $index (0..$#methylation_calls) {
+ − 2698 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2699 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2700
+ − 2701 if ($cigar){ # only needed for SAM files
+ − 2702 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2703 $cigar_offset += $cigar_mod;
+ − 2704 $pos_offset += $pos_mod;
+ − 2705 }
+ − 2706
+ − 2707 if ($methylation_calls[$index] eq 'X') {
+ − 2708 $counting{total_meCHG_count}++;
+ − 2709 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2710 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2711 $counting{total_unmethylated_CHG_count}++;
+ − 2712 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2713 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2714 $counting{total_meCpG_count}++;
+ − 2715 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2716 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2717 $counting{total_unmethylated_CpG_count}++;
+ − 2718 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2719 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2720 $counting{total_meCHH_count}++;
+ − 2721 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2722 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2723 $counting{total_unmethylated_CHH_count}++;
+ − 2724 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2725 }
+ − 2726 elsif ($methylation_calls[$index] eq '.') {}
+ − 2727 else{
+ − 2728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2729 }
+ − 2730 }
+ − 2731 }
+ − 2732 elsif ($strand eq '-') {
+ − 2733
+ − 2734 for my $index (0..$#methylation_calls) {
+ − 2735 ### methylated Cs (any context) will receive a forward (+) orientation
+ − 2736 ### not methylated Cs (any context) will receive a reverse (-) orientation
+ − 2737
+ − 2738 if ($cigar){ # only needed for SAM files
+ − 2739 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
+ − 2740 $cigar_offset += $cigar_mod;
+ − 2741 $pos_offset += $pos_mod;
+ − 2742 }
+ − 2743
+ − 2744 if ($methylation_calls[$index] eq 'X') {
+ − 2745 $counting{total_meCHG_count}++;
+ − 2746 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2747 } elsif ($methylation_calls[$index] eq 'x') {
+ − 2748 $counting{total_unmethylated_CHG_count}++;
+ − 2749 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2750 } elsif ($methylation_calls[$index] eq 'Z') {
+ − 2751 $counting{total_meCpG_count}++;
+ − 2752 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2753 } elsif ($methylation_calls[$index] eq 'z') {
+ − 2754 $counting{total_unmethylated_CpG_count}++;
+ − 2755 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2756 } elsif ($methylation_calls[$index] eq 'H') {
+ − 2757 $counting{total_meCHH_count}++;
+ − 2758 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2759 } elsif ($methylation_calls[$index] eq 'h') {
+ − 2760 $counting{total_unmethylated_CHH_count}++;
+ − 2761 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
+ − 2762 }
+ − 2763 elsif ($methylation_calls[$index] eq '.') {}
+ − 2764 else{
+ − 2765 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
+ − 2766 }
+ − 2767 }
+ − 2768 }
+ − 2769 else {
+ − 2770 die "The strand information was neither + nor -: $strand\n";
+ − 2771 }
+ − 2772 }
+ − 2773 }
+ − 2774
+ − 2775
+ − 2776
+ − 2777 #######################################################################################################################################
+ − 2778 ### bismark2bedGaph section - START
+ − 2779 #######################################################################################################################################
+ − 2780
+ − 2781 sub process_bedGraph_output{
+ − 2782 warn "="x64,"\n";
+ − 2783 warn "Methylation information will now be written into a bedGraph file\n";
+ − 2784 warn "="x64,"\n\n";
+ − 2785 sleep (2);
+ − 2786
+ − 2787 ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues
+ − 2788 foreach my $fh (keys %fhs) {
+ − 2789 if ($fh =~ /^[1230]$/) {
+ − 2790 foreach my $context (keys %{$fhs{$fh}}) {
+ − 2791 close $fhs{$fh}->{$context} or die $!;
+ − 2792 }
+ − 2793 } else {
+ − 2794 close $fhs{$fh} or die $!;
+ − 2795 }
+ − 2796 }
+ − 2797
+ − 2798 ### deciding which files to use for bedGraph conversion
+ − 2799 foreach my $filename (@sorting_files){
+ − 2800 # warn "$filename\n";
+ − 2801 if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
+ − 2802 $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
+ − 2803 # warn "$filename\n";
+ − 2804 }
+ − 2805
+ − 2806 if ($CX_context){
+ − 2807 push @bedfiles,$filename;
+ − 2808 }
+ − 2809 else{ ## CpG context only (default)
+ − 2810 if ($filename =~ /^CpG_/){
+ − 2811 push @bedfiles,$filename;
+ − 2812 }
+ − 2813 else{
+ − 2814 # skipping CHH or CHG files
+ − 2815 }
+ − 2816 }
+ − 2817 }
+ − 2818
+ − 2819 warn "Using the following files as Input:\n";
+ − 2820 print join ("\t",@bedfiles),"\n\n";
+ − 2821 sleep (2);
+ − 2822
+ − 2823 my %temp_fhs;
+ − 2824 my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
+ − 2825
+ − 2826 ### changing to the output directory
+ − 2827 unless ($output_dir eq ''){ # default
+ − 2828 chdir $output_dir or die "Failed to change directory to $output_dir\n";
+ − 2829 warn "Changed directory to $output_dir\n";
+ − 2830 }
+ − 2831
+ − 2832 foreach my $infile (@bedfiles) {
+ − 2833
+ − 2834 if ($remove) {
+ − 2835 warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
+ − 2836
+ − 2837 open (READ,$infile) or die $!;
+ − 2838
+ − 2839 my $removed_spaces_outfile = $infile;
+ − 2840 $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
+ − 2841
+ − 2842 open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
+ − 2843
+ − 2844 unless ($no_header){
+ − 2845 $_ = <READ>; ### Bismark version header
+ − 2846 print REM $_; ### Bismark version header
+ − 2847 }
+ − 2848
+ − 2849 while (<READ>) {
+ − 2850 chomp;
+ − 2851 my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
+ − 2852 $id =~ s/\s+/_/g;
+ − 2853 print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
+ − 2854 }
+ − 2855
+ − 2856 close READ or die $!;
+ − 2857 close REM or die $!;
+ − 2858
+ − 2859 ### changing the infile name to the new file without spaces
+ − 2860 $infile = $removed_spaces_outfile;
+ − 2861 }
+ − 2862
+ − 2863 warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
+ − 2864 open (IN,$infile) or die $!;
+ − 2865
+ − 2866 ## always ignoring the version header
+ − 2867 unless ($no_header){
+ − 2868 $_ = <IN>; ### Bismark version header
+ − 2869 }
+ − 2870
+ − 2871 while (<IN>) {
+ − 2872 chomp;
+ − 2873 my ($chr) = (split (/\t/))[2];
+ − 2874
+ − 2875 unless (exists $temp_fhs{$chr}) {
+ − 2876 open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
+ − 2877 }
+ − 2878 print {$temp_fhs{$chr}} "$_\n";
+ − 2879 }
+ − 2880
+ − 2881 warn "Finished writing out individual chromosome files for $infile\n";
+ − 2882 }
+ − 2883 warn "\n";
+ − 2884
+ − 2885 @temp_files = <*.meth_extractor.temp>;
+ − 2886
+ − 2887 warn "Collecting temporary chromosome file information...\n";
+ − 2888 sleep (1);
+ − 2889 warn "processing the following input file(s):\n";
+ − 2890 warn join ("\n",@temp_files),"\n\n";
+ − 2891 sleep (1);
+ − 2892
+ − 2893 foreach my $in (@temp_files) {
+ − 2894 warn "Sorting input file $in by positions\n";
+ − 2895 open my $ifh, "sort -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!";
+ − 2896 # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";
+ − 2897
+ − 2898 ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {
+ − 2899
+ − 2900 my $name;
+ − 2901 my $meth_state;
+ − 2902 my $chr = "";
+ − 2903 my $pos = 0;
+ − 2904 my $meth_state2;
+ − 2905
+ − 2906 my $last_pos;
+ − 2907 my $last_chr;
+ − 2908
+ − 2909 ############################################# }
+ − 2910
+ − 2911 while (my $line = <$ifh>) {
+ − 2912 next if $line =~ /^Bismark/;
+ − 2913 chomp $line;
+ − 2914
+ − 2915 ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {
+ − 2916
+ − 2917 $last_chr = $chr;
+ − 2918 $last_pos = $pos;
+ − 2919 ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
+ − 2920
+ − 2921 if (($last_pos ne $pos) || ($last_chr ne $chr)) {
+ − 2922 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
+ − 2923 @methylcalls = qw (0 0 0);
+ − 2924 }
+ − 2925
+ − 2926 ############################################# }
+ − 2927
+ − 2928 my $validated = validate_methylation_call($meth_state, $meth_state2);
+ − 2929 unless($validated){
+ − 2930 warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
+ − 2931 next;
+ − 2932 }
+ − 2933 if ($meth_state eq "+") {
+ − 2934 $methylcalls[0]++;
+ − 2935 $methylcalls[2]++;
+ − 2936 } else {
+ − 2937 $methylcalls[1]++;
+ − 2938 $methylcalls[2]++;
+ − 2939 }
+ − 2940 }
+ − 2941
+ − 2942 ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {
+ − 2943
+ − 2944 $last_chr = $chr;
+ − 2945 $last_pos = $pos;
+ − 2946 if ($methylcalls[2] > 0) {
+ − 2947 generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
+ − 2948 }
+ − 2949 ############################################# }
+ − 2950
+ − 2951 close $ifh or die $!;
+ − 2952
+ − 2953 @methylcalls = qw (0 0 0); # resetting @methylcalls
+ − 2954
+ − 2955 ### deleting temporary files
+ − 2956 my $delete = unlink $in;
+ − 2957 if ($delete) {
+ − 2958 warn "Successfully deleted the temporary input file $in\n\n";
+ − 2959 }
+ − 2960 else {
+ − 2961 warn "The temporary inputfile $in could not be deleted $!\n\n";
+ − 2962 }
+ − 2963 }
+ − 2964 }
+ − 2965
+ − 2966 sub generate_output{
+ − 2967 my $methcount = $methylcalls[0];
+ − 2968 my $nonmethcount = $methylcalls[1];
+ − 2969 my $totalcount = $methylcalls[2];
+ − 2970 my $last_chr = shift;
+ − 2971 my $last_pos = shift;
+ − 2972 croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
+ − 2973 croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
+ − 2974
+ − 2975 ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation {
+ − 2976
+ − 2977 my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
+ − 2978 my $meth_percentage;
+ − 2979 ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
+ − 2980 # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
+ − 2981 if (defined $meth_percentage){
+ − 2982 if ($counts){
+ − 2983 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
+ − 2984 }
+ − 2985 else{
+ − 2986 print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n";
+ − 2987 }
+ − 2988 }
+ − 2989 ############################################# }
+ − 2990 }
+ − 2991
+ − 2992 sub validate_methylation_call{
+ − 2993 my $meth_state = shift;
+ − 2994 croak "Missing (+/-) methylation call" unless defined $meth_state;
+ − 2995 my $meth_state2 = shift;
+ − 2996 croak "Missing alphabetical methylation call" unless defined $meth_state2;
+ − 2997 my $is_consistent;
+ − 2998 ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
+ − 2999 : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
+ − 3000 return 1 if $is_consistent;
+ − 3001 return 0;
+ − 3002 }
+ − 3003
+ − 3004 sub check_CpG_methylation_call{
+ − 3005 my $meth1 = shift;
+ − 3006 my $meth2 = shift;
+ − 3007 return 1 if($meth1 eq "+" && $meth2 eq "Z");
+ − 3008 return 1 if($meth1 eq "-" && $meth2 eq "z");
+ − 3009 return 0;
+ − 3010 }
+ − 3011
+ − 3012 sub check_nonCpG_methylation_call{
+ − 3013 my $meth1 = shift;
+ − 3014 my $meth2 = shift;
+ − 3015 return 1 if($meth1 eq "+" && $meth2 eq "C");
+ − 3016 return 1 if($meth1 eq "+" && $meth2 eq "X");
+ − 3017 return 1 if($meth1 eq "+" && $meth2 eq "H");
+ − 3018 return 1 if($meth1 eq "-" && $meth2 eq "c");
+ − 3019 return 1 if($meth1 eq "-" && $meth2 eq "x");
+ − 3020 return 1 if($meth1 eq "-" && $meth2 eq "h");
+ − 3021 return 0;
+ − 3022 }
+ − 3023
+ − 3024 #######################################################################################################################################
+ − 3025 ### bismark2bedGaph section - END
+ − 3026 #######################################################################################################################################
+ − 3027
+ − 3028
+ − 3029
+ − 3030
+ − 3031
+ − 3032
+ − 3033 #######################################################################################################################################
+ − 3034 ### genome-wide cytosine methylation report - START
+ − 3035 #######################################################################################################################################
+ − 3036
+ − 3037 sub generate_genome_wide_cytosine_report {
+ − 3038
+ − 3039 warn "="x78,"\n";
+ − 3040 warn "Methylation information will now be written into a genome-wide cytosine report\n";
+ − 3041 warn "="x78,"\n\n";
+ − 3042 sleep (2);
+ − 3043
+ − 3044 ### changing to the output directory again
+ − 3045 unless ($output_dir eq ''){ # default
+ − 3046 chdir $output_dir or die "Failed to change directory to $output_dir\n";
+ − 3047 # warn "Changed directory to $output_dir\n";
+ − 3048 }
+ − 3049
+ − 3050 my $in = shift;
+ − 3051 open (IN,$in) or die $!;
+ − 3052
+ − 3053 my $cytosine_out = shift;
+ − 3054
+ − 3055 if ($CX_context){
+ − 3056 $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
+ − 3057 }
+ − 3058 else{
+ − 3059 $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
+ − 3060 }
+ − 3061
+ − 3062 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
+ − 3063 unless ($split_by_chromosome){ ### writing all output to a single file (default)
+ − 3064 open (CYT,'>',$cytosine_out) or die $!;
+ − 3065 warn "Writing genome-wide cytosine report to: $cytosine_out\n";
+ − 3066 sleep (3);
+ − 3067 }
+ − 3068
+ − 3069 my $last_chr;
+ − 3070 my %chr; # storing reads for one chromosome at a time
+ − 3071
+ − 3072 my $count = 0;
+ − 3073 while (<IN>){
+ − 3074 chomp;
+ − 3075 ++$count;
+ − 3076 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
+ − 3077
+ − 3078 # defining the first chromosome
+ − 3079 unless (defined $last_chr){
+ − 3080 $last_chr = $chr;
+ − 3081 # warn "Storing all covered cytosine positions for chromosome: $chr\n";
+ − 3082 }
+ − 3083
+ − 3084 if ($chr eq $last_chr){
+ − 3085 $chr{$chr}->{$start}->{meth} = $meth;
+ − 3086 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+ − 3087 }
+ − 3088 else{
+ − 3089 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+ − 3090
+ − 3091 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
+ − 3092 my $chromosome_out = $cytosine_out;
+ − 3093 $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
+ − 3094 open (CYT,'>',$chromosome_out) or die $!;
+ − 3095 }
+ − 3096
+ − 3097 while ( $chromosomes{$last_chr} =~ /([CG])/g){
+ − 3098
+ − 3099 my $tri_nt = '';
+ − 3100 my $context = '';
+ − 3101 my $pos = pos$chromosomes{$last_chr};
+ − 3102
+ − 3103 my $strand;
+ − 3104 my $meth = 0;
+ − 3105 my $nonmeth = 0;
+ − 3106
+ − 3107 if ($1 eq 'C'){ # C on forward strand
+ − 3108 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
+ − 3109 $strand = '+';
+ − 3110 }
+ − 3111 elsif ($1 eq 'G'){ # C on reverse strand
+ − 3112 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
+ − 3113 $tri_nt = reverse $tri_nt;
+ − 3114 $tri_nt =~ tr/ACTG/TGAC/;
+ − 3115 $strand = '-';
+ − 3116 }
+ − 3117 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
+ − 3118
+ − 3119 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
+ − 3120 $meth = $chr{$last_chr}->{$pos-1}->{meth};
+ − 3121 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
+ − 3122 }
+ − 3123
+ − 3124 ### determining cytosine context
+ − 3125 if ($tri_nt =~ /^CG/){
+ − 3126 $context = 'CG';
+ − 3127 }
+ − 3128 elsif ($tri_nt =~ /^C.{1}G$/){
+ − 3129 $context = 'CHG';
+ − 3130 }
+ − 3131 elsif ($tri_nt =~ /^C.{2}$/){
+ − 3132 $context = 'CHH';
+ − 3133 }
+ − 3134 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+ − 3135 warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
+ − 3136 next;
+ − 3137 }
+ − 3138
+ − 3139 if ($CpG_only){
+ − 3140 if ($tri_nt =~ /^CG/){ # CpG context is the default
+ − 3141 if ($zero){ # zero based coordinates
+ − 3142 $pos -= 1;
+ − 3143 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3144 }
+ − 3145 else{ # default
+ − 3146 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3147 }
+ − 3148 }
+ − 3149 }
+ − 3150 else{ ## all cytosines, specified with --CX
+ − 3151 if ($zero){ # zero based coordinates
+ − 3152 $pos -= 1;
+ − 3153 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3154 }
+ − 3155 else{ # default
+ − 3156 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3157 }
+ − 3158 }
+ − 3159 }
+ − 3160
+ − 3161 %chr = (); # resetting the hash
+ − 3162
+ − 3163 # new first entry
+ − 3164 $last_chr = $chr;
+ − 3165 $chr{$chr}->{$start}->{meth} = $meth;
+ − 3166 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
+ − 3167 }
+ − 3168 }
+ − 3169
+ − 3170 # Last found chromosome
+ − 3171 warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
+ − 3172
+ − 3173 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
+ − 3174 my $chromosome_out = $cytosine_out;
+ − 3175 $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
+ − 3176 open (CYT,'>',$chromosome_out) or die $!;
+ − 3177 }
+ − 3178
+ − 3179 while ( $chromosomes{$last_chr} =~ /([CG])/g){
+ − 3180
+ − 3181 my $tri_nt;
+ − 3182 my $context;
+ − 3183 my $pos = pos$chromosomes{$last_chr};
+ − 3184
+ − 3185 my $strand;
+ − 3186 my $meth = 0;
+ − 3187 my $nonmeth = 0;
+ − 3188
+ − 3189 if ($1 eq 'C'){ # C on forward strand
+ − 3190 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
+ − 3191 $strand = '+';
+ − 3192 }
+ − 3193 elsif ($1 eq 'G'){ # C on reverse strand
+ − 3194 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
+ − 3195 $tri_nt = reverse $tri_nt;
+ − 3196 $tri_nt =~ tr/ACTG/TGAC/;
+ − 3197 $strand = '-';
+ − 3198 }
+ − 3199
+ − 3200 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
+ − 3201 $meth = $chr{$last_chr}->{$pos-1}->{meth};
+ − 3202 $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
+ − 3203 }
+ − 3204
+ − 3205 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
+ − 3206
+ − 3207 ### determining cytosine context
+ − 3208 if ($tri_nt =~ /^CG/){
+ − 3209 $context = 'CG';
+ − 3210 }
+ − 3211 elsif ($tri_nt =~ /^C.{1}G$/){
+ − 3212 $context = 'CHG';
+ − 3213 }
+ − 3214 elsif ($tri_nt =~ /^C.{2}$/){
+ − 3215 $context = 'CHH';
+ − 3216 }
+ − 3217 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
+ − 3218 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
+ − 3219 next;
+ − 3220 }
+ − 3221
+ − 3222 if ($CpG_only){
+ − 3223 if ($tri_nt =~ /^CG/){ # CpG context is the default
+ − 3224 if ($zero){ # zero-based coordinates
+ − 3225 $pos -= 1;
+ − 3226 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3227 }
+ − 3228 else{ # default
+ − 3229 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3230 }
+ − 3231 }
+ − 3232 }
+ − 3233 else{ ## all cytosines, specified with --CX
+ − 3234 if ($zero){ # zero based coordinates
+ − 3235 $pos -= 1;
+ − 3236 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3237 }
+ − 3238 else{ # default
+ − 3239 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
+ − 3240 }
+ − 3241 }
+ − 3242 }
+ − 3243 close CYT or die $!;
+ − 3244 }
+ − 3245
+ − 3246
+ − 3247 sub read_genome_into_memory{
+ − 3248
+ − 3249 ## reading in and storing the specified genome in the %chromosomes hash
+ − 3250 chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
+ − 3251 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
+ − 3252
+ − 3253 my @chromosome_filenames = <*.fa>;
+ − 3254
+ − 3255 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
+ − 3256 unless (@chromosome_filenames){
+ − 3257 @chromosome_filenames = <*.fasta>;
+ − 3258 }
+ − 3259 unless (@chromosome_filenames){
+ − 3260 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
+ − 3261 }
+ − 3262
+ − 3263 foreach my $chromosome_filename (@chromosome_filenames){
+ − 3264
+ − 3265 # skipping the tophat entire mouse genome fasta file
+ − 3266 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
+ − 3267
+ − 3268 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
+ − 3269 ### first line needs to be a fastA header
+ − 3270 my $first_line = <CHR_IN>;
+ − 3271 chomp $first_line;
+ − 3272 $first_line =~ s/\r//; # removing /r carriage returns
+ − 3273
+ − 3274 ### Extracting chromosome name from the FastA header
+ − 3275 my $chromosome_name = extract_chromosome_name($first_line);
+ − 3276
+ − 3277 my $sequence;
+ − 3278 while (<CHR_IN>){
+ − 3279 chomp;
+ − 3280 $_ =~ s/\r//; # removing /r carriage returns
+ − 3281
+ − 3282 if ($_ =~ /^>/){
+ − 3283 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
+ − 3284 if (exists $chromosomes{$chromosome_name}){
+ − 3285 warn "chr $chromosome_name (",length $sequence ," bp)\n";
+ − 3286 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
+ − 3287 }
+ − 3288 else {
+ − 3289 if (length($sequence) == 0){
+ − 3290 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
+ − 3291 }
+ − 3292 warn "chr $chromosome_name (",length $sequence ," bp)\n";
+ − 3293 $chromosomes{$chromosome_name} = $sequence;
+ − 3294 }
+ − 3295 ### resetting the sequence variable
+ − 3296 $sequence = '';
+ − 3297 ### setting new chromosome name
+ − 3298 $chromosome_name = extract_chromosome_name($_);
+ − 3299 }
+ − 3300 else{
+ − 3301 $sequence .= uc$_;
+ − 3302 }
+ − 3303 }
+ − 3304
+ − 3305 if (exists $chromosomes{$chromosome_name}){
+ − 3306 warn "chr $chromosome_name (",length $sequence ," bp)\t";
+ − 3307 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
+ − 3308 }
+ − 3309 else{
+ − 3310 if (length($sequence) == 0){
+ − 3311 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
+ − 3312 }
+ − 3313 warn "chr $chromosome_name (",length $sequence ," bp)\n";
+ − 3314 $chromosomes{$chromosome_name} = $sequence;
+ − 3315 }
+ − 3316 }
+ − 3317 warn "\n";
+ − 3318 chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
+ − 3319 }
+ − 3320
+ − 3321 sub extract_chromosome_name {
+ − 3322 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
+ − 3323 my $fasta_header = shift;
+ − 3324 if ($fasta_header =~ s/^>//){
+ − 3325 my ($chromosome_name) = split (/\s+/,$fasta_header);
+ − 3326 return $chromosome_name;
+ − 3327 }
+ − 3328 else{
+ − 3329 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
+ − 3330 }
+ − 3331 }
+ − 3332
+ − 3333 #######################################################################################################################################
+ − 3334 ### genome-wide cytosine methylation report - END
+ − 3335 #######################################################################################################################################
+ − 3336
+ − 3337
+ − 3338
+ − 3339
+ − 3340 sub print_helpfile{
+ − 3341
+ − 3342 print << 'HOW_TO';
+ − 3343
+ − 3344
+ − 3345 DESCRIPTION
+ − 3346
+ − 3347 The following is a brief description of all options to control the Bismark
+ − 3348 methylation extractor. The script reads in a bisulfite read alignment results file
+ − 3349 produced by the Bismark bisulfite mapper and extracts the methylation information
+ − 3350 for individual cytosines. This information is found in the methylation call field
+ − 3351 which can contain the following characters:
+ − 3352
+ − 3353 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ − 3354 ~~~ X for methylated C in CHG context (was protected) ~~~
+ − 3355 ~~~ x for not methylated C CHG (was converted) ~~~
+ − 3356 ~~~ H for methylated C in CHH context (was protected) ~~~
+ − 3357 ~~~ h for not methylated C in CHH context (was converted) ~~~
+ − 3358 ~~~ Z for methylated C in CpG context (was protected) ~~~
+ − 3359 ~~~ z for not methylated C in CpG context (was converted) ~~~
+ − 3360 ~~~ . for any bases not involving cytosines ~~~
+ − 3361 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ − 3362
+ − 3363 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
+ − 3364 context (this distinction is actually already made in Bismark itself). As the methylation
+ − 3365 information for every C analysed can produce files which easily have tens or even hundreds of
+ − 3366 millions of lines, file sizes can become very large and more difficult to handle. The C
+ − 3367 methylation info additionally splits cytosine methylation calls up into one of the four possible
+ − 3368 strands a given bisulfite read aligned against:
+ − 3369
+ − 3370 OT original top strand
+ − 3371 CTOT complementary to original top strand
+ − 3372
+ − 3373 OB original bottom strand
+ − 3374 CTOB complementary to original bottom strand
+ − 3375
+ − 3376 Thus, by default twelve individual output files are being generated per input file (unless
+ − 3377 --comprehensive is specified, see below). The output files can be imported into a genome
+ − 3378 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
+ − 3379 unless the bisulfite reads were generated preserving directionality it doesn't make any
+ − 3380 sense to look at the data in a strand-specific manner). Strand-specific output files can
+ − 3381 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
+ − 3382 will be generated. For both the strand-specific and comprehensive outputs there is also
+ − 3383 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
+ − 3384
+ − 3385
+ − 3386 The output files are in the following format (tab delimited):
+ − 3387
+ − 3388 <sequence_id> <strand> <chromosome> <position> <methylation call>
+ − 3389
+ − 3390
+ − 3391 USAGE: methylation_extractor [options] <filenames>
+ − 3392
+ − 3393
+ − 3394 ARGUMENTS:
+ − 3395
+ − 3396 <filenames> A space-separated list of Bismark result files in SAM format from
+ − 3397 which methylation information is extracted for every cytosine in
+ − 3398 the reads. For alignment files in the older custom Bismark output
+ − 3399 see option '--vanilla'.
+ − 3400
+ − 3401 OPTIONS:
+ − 3402
+ − 3403 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
+ − 3404 read data. Specifying either --single-end or --paired-end is
+ − 3405 mandatory.
+ − 3406
+ − 3407 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
+ − 3408 read data. Specifying either --paired-end or --single-end is
+ − 3409 mandatory.
+ − 3410
+ − 3411 --vanilla The Bismark result input file(s) are in the old custom Bismark format
+ − 3412 (up to version 0.5.x) and not in SAM format which is the default as
+ − 3413 of Bismark version 0.6.x or higher. Default: OFF.
+ − 3414
+ − 3415 --no_overlap For paired-end reads it is theoretically possible that read_1 and
+ − 3416 read_2 overlap. This option avoids scoring overlapping methylation
+ − 3417 calls twice (only methylation calls of read 1 are used for in the process
+ − 3418 since read 1 has historically higher quality basecalls than read 2).
+ − 3419 Whilst this option removes a bias towards more methylation calls
+ − 3420 in the center of sequenced fragments it may de facto remove a sizable
+ − 3421 proportion of the data. This option is highly recommended for paired-end
+ − 3422 data.
+ − 3423
+ − 3424 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
+ − 3425 methylation call string. This can remove e.g. a restriction enzyme site
+ − 3426 at the start of each read.
+ − 3427
+ − 3428 --comprehensive Specifying this option will merge all four possible strand-specific
+ − 3429 methylation info into context-dependent output files. The default
+ − 3430 contexts are:
+ − 3431 - CpG context
+ − 3432 - CHG context
+ − 3433 - CHH context
+ − 3434
+ − 3435 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
+ − 3436 strand-specific output files (default) for Cs in
+ − 3437 - CpG context
+ − 3438 - non-CpG context
+ − 3439
+ − 3440 --report Prints out a short methylation summary as well as the paramaters used to run
+ − 3441 this script.
+ − 3442
+ − 3443 --no_header Suppresses the Bismark version header line in all output files for more convenient
+ − 3444 batch processing.
+ − 3445
+ − 3446 -o/--output DIR Allows specification of a different output directory (absolute or relative
+ − 3447 path). If not specified explicitely, the output will be written to the current directory.
+ − 3448
+ − 3449 --version Displays version information.
+ − 3450
+ − 3451 -h/--help Displays this help file and exits.
+ − 3452
+ − 3453
+ − 3454
+ − 3455 bedGraph specific options:
+ − 3456
+ − 3457 --bedGraph After finishing the methylation extraction, the methylation output is written into a
+ − 3458 sorted bedGraph file that reports the position of a given cytosine and its methylation
+ − 3459 state (in %, seem details below). The methylation extractor output is temporarily split up into
+ − 3460 temporary files, one per chromosome (written into the current directory or folder
+ − 3461 specified with -o/--output); these temp files are then used for sorting and deleted
+ − 3462 afterwards. By default, only cytosines in CpG context will be sorted. The option
+ − 3463 '--CX_context' may be used to report all cyosines irrespective of sequence context
+ − 3464 (this will take MUCH longer!).
+ − 3465
+ − 3466
+ − 3467 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
+ − 3468 before its methylation percentage is reported. Default: 1.
+ − 3469
+ − 3470 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
+ − 3471
+ − 3472
+ − 3473 --counts Adds two additional columns to the output file to enable further calculations:
+ − 3474 col 5: number of methylated calls
+ − 3475 col 6: number of unmethylated calls
+ − 3476 This option is required if '--cytosine_report' is specified (and will be set automatically if
+ − 3477 necessary).
+ − 3478
+ − 3479 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
+ − 3480 in the experiment irrespective of its sequence context. This applies to both forward and
+ − 3481 reverse strands. Please be aware that this option may generate large temporary and output files
+ − 3482 and may take a long time to sort (up to many hours). Default: OFF.
+ − 3483 (i.e. Default = CpG context only).
+ − 3484
+ − 3485
+ − 3486
+ − 3487 Genome-wide cytosine methylation report specific options:
+ − 3488
+ − 3489 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
+ − 3490 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
+ − 3491 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
+ − 3492 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
+ − 3493 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
+ − 3494 covered).
+ − 3495
+ − 3496 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
+ − 3497 its context. This applies to both forward and reverse strands. Please be aware that this will
+ − 3498 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
+ − 3499 Default: OFF (i.e. Default = CpG context only).
+ − 3500
+ − 3501 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
+ − 3502
+ − 3503 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
+ − 3504 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
+ − 3505
+ − 3506 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
+ − 3507 will be named to include the input filename and the chromosome number.
+ − 3508
+ − 3509
+ − 3510
+ − 3511 OUTPUT:
+ − 3512
+ − 3513 The bismark_methylation_extractor output is in the form:
+ − 3514 ========================================================
+ − 3515 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
+ − 3516
+ − 3517 * Methylated cytosines receive a '+' orientation,
+ − 3518 * Unmethylated cytosines receive a '-' orientation.
+ − 3519
+ − 3520
+ − 3521
+ − 3522 The bedGraph output (optional) looks like this (tab-delimited):
+ − 3523 ===============================================================
+ − 3524 <chromosome> <start position> <end position> <methylation percentage>
+ − 3525
+ − 3526
+ − 3527
+ − 3528 The genome-wide cytosine methylation output file is tab-delimited in the following format:
+ − 3529 ==========================================================================================
+ − 3530 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
+ − 3531
+ − 3532
+ − 3533
+ − 3534 This script was last modified on 02 Oct 2012.
+ − 3535
+ − 3536 HOW_TO
+ − 3537 }