comparison bismark_methylation_extractor @ 0:62c6da72dd4a draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:57:36 -0400
parents
children 91f07ff056ca
comparison
equal deleted inserted replaced
-1:000000000000 0:62c6da72dd4a
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8 use FindBin qw($Bin);
9 use lib "$Bin/../lib";
10
11 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
12
13 ## This program is free software: you can redistribute it and/or modify
14 ## it under the terms of the GNU General Public License as published by
15 ## the Free Software Foundation, either version 3 of the License, or
16 ## (at your option) any later version.
17
18 ## This program is distributed in the hope that it will be useful,
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ## GNU General Public License for more details.
22
23 ## You should have received a copy of the GNU General Public License
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
25
26 my @filenames; # input files
27 my %counting;
28 my $parent_dir = getcwd();
29
30 my %fhs;
31
32 my $version = 'v0.7.11';
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip) = process_commandline();
34
35
36 ### only needed for bedGraph output
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
39 my @bedfiles;
40
41 ### only needed for genome-wide cytosine methylation report
42 my %chromosomes;
43
44 ##############################################################################################
45 ### Summarising Run Parameters
46 ##############################################################################################
47
48 ### METHYLATION EXTRACTOR
49
50 warn "Summarising Bismark methylation extractor parameters:\n";
51 warn '='x63,"\n";
52
53 if ($single){
54 if ($vanilla){
55 warn "Bismark single-end vanilla format specified\n";
56 }
57 else{
58 warn "Bismark single-end SAM format specified (default)\n"; # default
59 }
60 }
61 elsif ($paired){
62 if ($vanilla){
63 warn "Bismark paired-end vanilla format specified\n";
64 }
65 else{
66 warn "Bismark paired-end SAM format specified (default)\n"; # default
67 }
68 }
69
70 if ($ignore){
71 warn "First $ignore bases will be disregarded when processing the methylation call string\n";
72 }
73
74 if ($full){
75 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
76 }
77 if ($merge_non_CpG){
78 warn "Merge CHG and CHH context to non-CpG context specified\n";
79 }
80 ### output directory
81 if ($output_dir eq ''){
82 warn "Output will be written to the current directory ('$parent_dir')\n";
83 }
84 else{
85 warn "Output path specified as: $output_dir\n";
86 }
87
88
89 sleep (1);
90
91 ### BEDGRAPH
92
93 if ($bedGraph){
94 warn "\n\nSummarising bedGraph parameters:\n";
95 warn '='x63,"\n";
96
97 if ($counts){
98 warn "Generating additional output in bedGraph format including methylating counts (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>)\n";
99 }
100 else{
101 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
102 }
103
104 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
105
106 if ($CX_context){
107 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
108 }
109 else{ # default
110 $CpG_only = 1;
111 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
112 }
113
114 if ($remove){
115 warn "White spaces in read ID names will be removed prior to sorting\n";
116 }
117
118 if (defined $sort_size){
119 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
120 }
121 else{
122 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
123 }
124
125
126
127 sleep (1);
128
129 if ($cytosine_report){
130 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
131 warn '='x63,"\n";
132 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
133
134
135 if ($CX_context){
136 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
137 }
138 else{ # default
139 $CpG_only = 1;
140 warn "Reporting cytosine methylation in CpG context only (default)\n";
141 }
142
143 if ($split_by_chromosome){
144 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
145 }
146
147 ### Zero-based coordinates
148 if ($zero){
149 warn "Using zero-based genomic coordinates (user-defined)\n";
150 }
151 else{ # default, 1-based coords
152 warn "Using 1-based genomic coordinates (default)\n";
153 }
154
155 ### GENOME folder
156 if ($genome_folder){
157 unless ($genome_folder =~/\/$/){
158 $genome_folder =~ s/$/\//;
159 }
160 warn "Genome folder was specified as $genome_folder\n";
161 }
162 else{
163 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
164 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
165 }
166 sleep (1);
167 }
168 }
169
170 warn "\n";
171 sleep (5);
172
173 ######################################################
174 ### PROCESSING FILES
175 ######################################################
176
177 foreach my $filename (@filenames){
178 # resetting counters and filehandles
179 %fhs = ();
180 %counting =(
181 total_meCHG_count => 0,
182 total_meCHH_count => 0,
183 total_meCpG_count => 0,
184 total_unmethylated_CHG_count => 0,
185 total_unmethylated_CHH_count => 0,
186 total_unmethylated_CpG_count => 0,
187 sequences_count => 0,
188 );
189 @sorting_files = ();
190 @bedfiles = ();
191
192 process_Bismark_results_file($filename);
193
194 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
195 foreach my $fh (keys %fhs) {
196 if ($fh =~ /^[1230]$/) {
197 foreach my $context (keys %{$fhs{$fh}}) {
198 close $fhs{$fh}->{$context} or die $!;
199
200 }
201 } else {
202 close $fhs{$fh} or die $!;
203 }
204 }
205
206 if ($bedGraph){
207
208 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
209 $out =~ s/gz$//;
210 $out =~ s/sam$//;
211 $out =~ s/bam$//;
212 $out =~ s/txt$//;
213 $out =~ s/$/bedGraph/;
214
215
216
217 my $bedGraph_output = $out;
218 my @args;
219
220 if ($remove){
221 push @args, '--remove';
222 }
223 if ($CX_context){
224 push @args, '--CX_context';
225 }
226 if ($no_header){
227 push @args, '--no_header';
228 }
229 if ($counts){
230 push @args, "--counts";
231 }
232
233 push @args, "--buffer_size $sort_size";
234 push @args, "--cutoff $coverage_threshold";
235 push @args, "--output $bedGraph_output";
236 push @args, "--dir '$output_dir'";
237
238 ### adding all files to be sorted to @args
239 foreach my $f (@sorting_files){
240 push @args, $f;
241 }
242
243 # print join "\t",@args,"\n";
244
245 system ("$Bin/bismark2bedGraph @args");
246
247 warn "Finished BedGraph conversion ...\n\n";
248 sleep(3);
249
250 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
251 # warn "Writing bedGraph to file: $bedGraph_output\n";
252 # process_bedGraph_output();
253 # close OUT or die $!;
254
255 ### genome-wide cytosine methylation report requires bedGraph processing anyway
256 if ($cytosine_report){
257 @args = (); # resetting @args
258 my $cytosine_out = $out;
259 $cytosine_out =~ s/bedGraph$//;
260
261 if ($CX_context){
262 $cytosine_out =~ s/$/CX_report.txt/;
263 }
264 else{
265 $cytosine_out =~ s/$/CpG_report.txt/;
266 }
267
268 push @args, "--output $cytosine_out";
269 push @args, "--dir '$output_dir'";
270 push @args, "--genome '$genome_folder'";
271 push @args, "--parent_dir '$parent_dir'";
272
273 if ($zero){
274 push @args, "--zero";
275 }
276 if ($CX_context){
277 push @args, '--CX_context';
278 }
279 if ($split_by_chromosome){
280 push @args, '--split_by_chromosome';
281 }
282
283 push @args, $bedGraph_output; # this will be the infile
284
285 system ("$Bin/bedGraph2cytosine @args");
286 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
287 warn "\n\nFinished generating genome-wide cytosine report\n\n";
288 }
289 }
290 }
291
292
293 sub process_commandline{
294 my $help;
295 my $single_end;
296 my $paired_end;
297 my $ignore;
298 my $genomic_fasta;
299 my $full;
300 my $report;
301 my $extractor_version;
302 my $no_overlap;
303 my $merge_non_CpG;
304 my $vanilla;
305 my $output_dir;
306 my $no_header;
307 my $bedGraph;
308 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
309 my $remove;
310 my $counts;
311 my $cytosine_report;
312 my $genome_folder;
313 my $zero;
314 my $CpG_only;
315 my $CX_context;
316 my $split_by_chromosome;
317 my $sort_size;
318 my $samtools_path;
319 my $gzip;
320
321 my $command_line = GetOptions ('help|man' => \$help,
322 'p|paired-end' => \$paired_end,
323 's|single-end' => \$single_end,
324 'fasta' => \$genomic_fasta,
325 'ignore=i' => \$ignore,
326 'comprehensive' => \$full,
327 'report' => \$report,
328 'version' => \$extractor_version,
329 'no_overlap' => \$no_overlap,
330 'merge_non_CpG' => \$merge_non_CpG,
331 'vanilla' => \$vanilla,
332 'o|output=s' => \$output_dir,
333 'no_header' => \$no_header,
334 'bedGraph' => \$bedGraph,
335 "cutoff=i" => \$coverage_threshold,
336 "remove_spaces" => \$remove,
337 "counts" => \$counts,
338 "cytosine_report" => \$cytosine_report,
339 'g|genome_folder=s' => \$genome_folder,
340 "zero_based" => \$zero,
341 "CX|CX_context" => \$CX_context,
342 "split_by_chromosome" => \$split_by_chromosome,
343 "buffer_size=s" => \$sort_size,
344 'samtools_path=s' => \$samtools_path,
345 "gzip" => \$gzip,
346 );
347
348 ### EXIT ON ERROR if there were errors with any of the supplied options
349 unless ($command_line){
350 die "Please respecify command line options\n";
351 }
352
353 ### HELPFILE
354 if ($help){
355 print_helpfile();
356 exit;
357 }
358
359 if ($extractor_version){
360 print << "VERSION";
361
362
363 Bismark Methylation Extractor
364
365 Bismark Extractor Version: $version
366 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
367 www.bioinformatics.babraham.ac.uk/projects/bismark/
368
369
370 VERSION
371 exit;
372 }
373
374
375 ### no files provided
376 unless (@ARGV){
377 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
378 }
379 @filenames = @ARGV;
380
381 warn "\n *** Bismark methylation extractor version $version ***\n\n";
382
383 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
384 unless ($ignore){
385 $ignore = 0;
386 }
387 ### PRINT A REPORT
388 unless ($report){
389 $report = 0;
390 }
391
392 ### OUTPUT DIR PATH
393 if ($output_dir){
394 unless ($output_dir =~ /\/$/){
395 $output_dir =~ s/$/\//;
396 }
397 }
398 else{
399 $output_dir = '';
400 }
401
402 ### NO HEADER
403 unless ($no_header){
404 $no_header = 0;
405 }
406
407 ### OLD (VANILLA) OUTPUT FORMAT
408 unless ($vanilla){
409 $vanilla = 0;
410 }
411
412 if ($single_end){
413 $paired_end = 0; ### SINGLE END ALIGNMENTS
414 }
415 elsif ($paired_end){
416 $single_end = 0; ### PAIRED-END ALIGNMENTS
417 }
418 else{
419 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
420 }
421
422 ### NO OVERLAP
423 if ($no_overlap){
424 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
425 }
426 else{
427 $no_overlap = 0;
428 }
429
430 ### COMPREHENSIVE OUTPUT
431 unless ($full){
432 $full = 0;
433 }
434
435 ### MERGE NON-CpG context
436 unless ($merge_non_CpG){
437 $merge_non_CpG = 0;
438 }
439
440 ### remove white spaces in read ID (needed for sorting using the sort command
441 unless ($remove){
442 $remove = 0;
443 }
444
445 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
446 if (defined $coverage_threshold){
447 unless ($coverage_threshold > 0){
448 die "Please select a coverage greater than 0 (positive integers only)\n";
449 }
450 }
451 else{
452 $coverage_threshold = 1;
453 }
454
455 ### SORT buffer size
456 if (defined $sort_size){
457 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
458 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
459 }
460 }
461 else{
462 $sort_size = '2G';
463 }
464
465 if ($zero){
466 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
467 }
468
469 if ($CX_context){
470 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
471 }
472 else{
473 $CX_context = 0;
474 }
475
476 unless ($counts){
477 $counts = 0;
478 }
479
480 if ($cytosine_report){
481
482 ### GENOME folder
483 if ($genome_folder){
484 unless ($genome_folder =~/\/$/){
485 $genome_folder =~ s/$/\//;
486 }
487 }
488 else{
489 die "Please specify a genome folder to proceed (full path only)\n";
490 }
491
492 unless ($bedGraph){
493 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
494 $bedGraph = 1;
495 }
496 unless ($counts){
497 warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
498 $counts = 1;
499 }
500 warn "\n";
501 }
502
503 ### PATH TO SAMTOOLS
504 if (defined $samtools_path){
505 # if Samtools was specified as full command
506 if ($samtools_path =~ /samtools$/){
507 if (-e $samtools_path){
508 # Samtools executable found
509 }
510 else{
511 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
512 }
513 }
514 else{
515 unless ($samtools_path =~ /\/$/){
516 $samtools_path =~ s/$/\//;
517 }
518 $samtools_path .= 'samtools';
519 if (-e $samtools_path){
520 # Samtools executable found
521 }
522 else{
523 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
524 }
525 }
526 }
527 # Check whether Samtools is in the PATH if no path was supplied by the user
528 else{
529 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
530 $samtools_path = `which samtools`;
531 chomp $samtools_path;
532 }
533 }
534
535 unless (defined $samtools_path){
536 $samtools_path = '';
537 }
538
539 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip);
540 }
541
542
543 sub process_Bismark_results_file{
544 my $filename = shift;
545
546 warn "\nNow reading in Bismark result file $filename\n\n";
547
548 if ($filename =~ /\.gz$/) {
549 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
550 }
551 elsif ($filename =~ /bam$/) {
552 if ($samtools_path){
553 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
554 }
555 else{
556 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
557 }
558 }
559 else {
560 open (IN,$filename) or die "Can't open file $filename: $!\n";
561 }
562
563 ### Vanilla and SAM output need to read different numbers of header lines
564 if ($vanilla) {
565 my $bismark_version = <IN>; ## discarding the Bismark version info
566 chomp $bismark_version;
567 $bismark_version =~ s/\r//; # replaces \r line feed
568 $bismark_version =~ s/Bismark version: //;
569 if ($bismark_version =~ /^\@/) {
570 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
571 sleep (2);
572 }
573
574 unless ($version eq $bismark_version){
575 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
576 }
577 } else {
578 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
579 # We are reading from it further down
580 }
581
582 my $output_filename = (split (/\//,$filename))[-1];
583
584 ### OPENING OUTPUT-FILEHANDLES
585 if ($report) {
586 my $report_filename = $output_filename;
587 $report_filename =~ s/\.sam$//;
588 $report_filename =~ s/\.txt$//;
589 $report_filename =~ s/$/_splitting_report.txt/;
590 $report_filename = $output_dir . $report_filename;
591 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
592 }
593
594 if ($report) {
595 print REPORT "$output_filename\n\n";
596 print REPORT "Parameters used to extract methylation information:\n";
597 if ($paired) {
598 if ($vanilla) {
599 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
600 } else {
601 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
602 }
603 }
604
605 if ($single) {
606 if ($vanilla) {
607 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
608 } else {
609 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
610 }
611 }
612
613 if ($ignore) {
614 print REPORT "Ignoring first $ignore bases\n";
615 }
616
617 if ($full) {
618 print REPORT "Output specified: comprehensive\n";
619 } else {
620 print REPORT "Output specified: strand-specific (default)\n";
621 }
622
623 if ($no_overlap) {
624 print REPORT "No overlapping methylation calls specified\n";
625 }
626 if ($genomic_fasta) {
627 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
628 }
629 if ($merge_non_CpG) {
630 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
631 }
632
633 print REPORT "\n";
634 }
635
636 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
637
638 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
639 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
640 if ($full and $merge_non_CpG) {
641 my $cpg_output = my $other_c_output = $output_filename;
642 ### C in CpG context
643 $cpg_output =~ s/^/CpG_context_/;
644 $cpg_output =~ s/sam$/txt/;
645 $cpg_output =~ s/bam$/txt/;
646 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
647 $cpg_output = $output_dir . $cpg_output;
648
649 if ($gzip){
650 $cpg_output .= '.gz';
651 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n";
652 }
653 else{
654 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
655 }
656
657 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
658 push @sorting_files,$cpg_output;
659
660 unless ($no_header) {
661 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
662 }
663
664 ### C in any other context than CpG
665 $other_c_output =~ s/^/Non_CpG_context_/;
666 $other_c_output =~ s/sam$/txt/;
667 $other_c_output =~ s/bam$/txt/;
668 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
669 $other_c_output = $output_dir . $other_c_output;
670
671 if ($gzip){
672 $other_c_output .= '.gz';
673 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n";
674 }
675 else{
676 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n";
677 }
678
679 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n";
680 push @sorting_files,$other_c_output;
681
682
683 unless ($no_header) {
684 print {$fhs{other_context}} "Bismark methylation extractor version $version\n";
685 }
686 }
687
688 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
689 elsif ($merge_non_CpG) {
690
691 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
692
693 ### For cytosines in CpG context
694 $cpg_ot =~ s/^/CpG_OT_/;
695 $cpg_ot =~ s/sam$/txt/;
696 $cpg_ot =~ s/bam$/txt/;
697 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
698 $cpg_ot = $output_dir . $cpg_ot;
699
700 if ($gzip){
701 $cpg_ot .= '.gz';
702 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n";
703 }
704 else{
705 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
706 }
707
708 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
709 push @sorting_files,$cpg_ot;
710
711 unless($no_header){
712 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
713 }
714
715 $cpg_ctot =~ s/^/CpG_CTOT_/;
716 $cpg_ctot =~ s/sam$/txt/;
717 $cpg_ctot =~ s/bam$/txt/;
718 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
719 $cpg_ctot = $output_dir . $cpg_ctot;
720
721 if ($gzip){
722 $cpg_ctot .= '.gz';
723 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n";
724 }
725 else{
726 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
727 }
728
729 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
730 push @sorting_files,$cpg_ctot;
731
732 unless($no_header){
733 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
734 }
735
736 $cpg_ctob =~ s/^/CpG_CTOB_/;
737 $cpg_ctob =~ s/sam$/txt/;
738 $cpg_ctob =~ s/bam$/txt/;
739 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
740 $cpg_ctob = $output_dir . $cpg_ctob;
741
742 if ($gzip){
743 $cpg_ctob .= '.gz';
744 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n";
745 }
746 else{
747 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
748 }
749
750 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
751 push @sorting_files,$cpg_ctob;
752
753 unless($no_header){
754 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
755 }
756
757 $cpg_ob =~ s/^/CpG_OB_/;
758 $cpg_ob =~ s/sam$/txt/;
759 $cpg_ob =~ s/bam$/txt/;
760 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
761 $cpg_ob = $output_dir . $cpg_ob;
762
763 if ($gzip){
764 $cpg_ob .= '.gz';
765 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n";
766 }
767 else{
768 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
769 }
770
771 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
772 push @sorting_files,$cpg_ob;
773
774 unless($no_header){
775 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
776 }
777
778 ### For cytosines in Non-CpG (CC, CT or CA) context
779 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
780
781 $other_c_ot =~ s/^/Non_CpG_OT_/;
782 $other_c_ot =~ s/sam$/txt/;
783 $other_c_ot =~ s/bam$/txt/;
784 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
785 $other_c_ot = $output_dir . $other_c_ot;
786
787 if ($gzip){
788 $other_c_ot .= '.gz';
789 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n";
790 }
791 else{
792 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n";
793 }
794
795 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n";
796 push @sorting_files,$other_c_ot;
797
798 unless($no_header){
799 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n";
800 }
801
802 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
803 $other_c_ctot =~ s/sam$/txt/;
804 $other_c_ctot =~ s/bam$/txt/;
805 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
806 $other_c_ctot = $output_dir . $other_c_ctot;
807
808 if ($gzip){
809 $other_c_ctot .= '.gz';
810 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n";
811 }
812 else{
813 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n";
814 }
815
816 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n";
817 push @sorting_files,$other_c_ctot;
818
819 unless($no_header){
820 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n";
821 }
822
823 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
824 $other_c_ctob =~ s/sam$/txt/;
825 $other_c_ctob =~ s/bam$/txt/;
826 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
827 $other_c_ctob = $output_dir . $other_c_ctob;
828
829 if ($gzip){
830 $other_c_ctob .= '.gz';
831 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n";
832 }
833 else{
834 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n";
835 }
836
837 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n";
838 push @sorting_files,$other_c_ctob;
839
840 unless($no_header){
841 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n";
842 }
843
844 $other_c_ob =~ s/^/Non_CpG_OB_/;
845 $other_c_ob =~ s/sam$/txt/;
846 $other_c_ob =~ s/sam$/txt/;
847 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
848 $other_c_ob = $output_dir . $other_c_ob;
849
850 if ($gzip){
851 $other_c_ob .= '.gz';
852 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n";
853 }
854 else{
855 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n";
856 }
857
858 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n";
859 push @sorting_files,$other_c_ob;
860
861 unless($no_header){
862 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n";
863 }
864 }
865 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
866
867 ### if --comprehensive was specified we are only writing one file per context
868 elsif ($full) {
869 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
870 ### C in CpG context
871 $cpg_output =~ s/^/CpG_context_/;
872 $cpg_output =~ s/sam$/txt/;
873 $cpg_output =~ s/bam$/txt/;
874 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
875 $cpg_output = $output_dir . $cpg_output;
876
877 if ($gzip){
878 $cpg_output .= '.gz';
879 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n";
880 }
881 else{
882 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n";
883 }
884
885 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n";
886 push @sorting_files,$cpg_output;
887
888 unless($no_header){
889 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n";
890 }
891
892 ### C in CHG context
893 $chg_output =~ s/^/CHG_context_/;
894 $chg_output =~ s/sam$/txt/;
895 $chg_output =~ s/bam$/txt/;
896 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
897 $chg_output = $output_dir . $chg_output;
898
899 if ($gzip){
900 $chg_output .= '.gz';
901 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n";
902 }
903 else{
904 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n";
905 }
906
907 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n";
908 push @sorting_files,$chg_output;
909
910 unless($no_header){
911 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n";
912 }
913
914 ### C in CHH context
915 $chh_output =~ s/^/CHH_context_/;
916 $chh_output =~ s/sam$/txt/;
917 $chh_output =~ s/bam$/txt/;
918 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
919 $chh_output = $output_dir . $chh_output;
920
921 if ($gzip){
922 $chh_output .= '.gz';
923 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n";
924 }
925 else{
926 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n";
927 }
928
929 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n";
930 push @sorting_files, $chh_output;
931
932 unless($no_header){
933 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n";
934 }
935 }
936 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
937 else {
938 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
939
940 ### For cytosines in CpG context
941 $cpg_ot =~ s/^/CpG_OT_/;
942 $cpg_ot =~ s/sam$/txt/;
943 $cpg_ot =~ s/bam$/txt/;
944 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
945 $cpg_ot = $output_dir . $cpg_ot;
946
947 if ($gzip){
948 $cpg_ot .= '.gz';
949 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n";
950 }
951 else{
952 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n";
953 }
954
955 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n";
956 push @sorting_files,$cpg_ot;
957
958 unless($no_header){
959 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n";
960 }
961
962 $cpg_ctot =~ s/^/CpG_CTOT_/;
963 $cpg_ctot =~ s/sam$/txt/;
964 $cpg_ctot =~ s/bam$/txt/;
965 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
966 $cpg_ctot = $output_dir . $cpg_ctot;
967
968 if ($gzip){
969 $cpg_ctot .= '.gz';
970 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n";
971 }
972 else{
973 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n";
974 }
975
976 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n";
977 push @sorting_files,$cpg_ctot;
978
979 unless($no_header){
980 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n";
981 }
982
983 $cpg_ctob =~ s/^/CpG_CTOB_/;
984 $cpg_ctob =~ s/sam$/txt/;
985 $cpg_ctob =~ s/bam$/txt/;
986 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
987 $cpg_ctob = $output_dir . $cpg_ctob;
988
989 if ($gzip){
990 $cpg_ctob .= '.gz';
991 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n";
992 }
993 else{
994 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n";
995 }
996
997 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n";
998 push @sorting_files,$cpg_ctob;
999
1000 unless($no_header){
1001 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n";
1002 }
1003
1004 $cpg_ob =~ s/^/CpG_OB_/;
1005 $cpg_ob =~ s/sam$/txt/;
1006 $cpg_ob =~ s/bam$/txt/;
1007 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
1008 $cpg_ob = $output_dir . $cpg_ob;
1009
1010 if ($gzip){
1011 $cpg_ob .= '.gz';
1012 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n";
1013 }
1014 else{
1015 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n";
1016 }
1017
1018 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n";
1019 push @sorting_files,$cpg_ob;
1020
1021 unless($no_header){
1022 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n";
1023 }
1024
1025 ### For cytosines in CHG context
1026 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
1027
1028 $chg_ot =~ s/^/CHG_OT_/;
1029 $chg_ot =~ s/sam$/txt/;
1030 $chg_ot =~ s/bam$/txt/;
1031 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
1032 $chg_ot = $output_dir . $chg_ot;
1033
1034 if ($gzip){
1035 $chg_ot .= '.gz';
1036 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n";
1037 }
1038 else{
1039 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n";
1040 }
1041
1042 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n";
1043 push @sorting_files,$chg_ot;
1044
1045 unless($no_header){
1046 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n";
1047 }
1048
1049 $chg_ctot =~ s/^/CHG_CTOT_/;
1050 $chg_ctot =~ s/sam$/txt/;
1051 $chg_ctot =~ s/bam$/txt/;
1052 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
1053 $chg_ctot = $output_dir . $chg_ctot;
1054
1055 if ($gzip){
1056 $chg_ctot .= '.gz';
1057 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n";
1058 }
1059 else{
1060 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n";
1061 }
1062
1063 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n";
1064 push @sorting_files,$chg_ctot;
1065
1066 unless($no_header){
1067 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n";
1068 }
1069
1070 $chg_ctob =~ s/^/CHG_CTOB_/;
1071 $chg_ctob =~ s/sam$/txt/;
1072 $chg_ctob =~ s/bam$/txt/;
1073 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
1074 $chg_ctob = $output_dir . $chg_ctob;
1075
1076 if ($gzip){
1077 $chg_ctob .= '.gz';
1078 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n";
1079 }
1080 else{
1081 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n";
1082 }
1083
1084 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n";
1085 push @sorting_files,$chg_ctob;
1086
1087 unless($no_header){
1088 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n";
1089 }
1090
1091 $chg_ob =~ s/^/CHG_OB_/;
1092 $chg_ob =~ s/sam$/txt/;
1093 $chg_ob =~ s/bam$/txt/;
1094 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
1095 $chg_ob = $output_dir . $chg_ob;
1096
1097 if ($gzip){
1098 $chg_ob .= '.gz';
1099 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n";
1100 }
1101 else{
1102 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n";
1103 }
1104
1105 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n";
1106 push @sorting_files,$chg_ob;
1107
1108 unless($no_header){
1109 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n";
1110 }
1111
1112 ### For cytosines in CHH context
1113 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
1114
1115 $chh_ot =~ s/^/CHH_OT_/;
1116 $chh_ot =~ s/sam$/txt/;
1117 $chh_ot =~ s/bam$/txt/;
1118 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
1119 $chh_ot = $output_dir . $chh_ot;
1120
1121 if ($gzip){
1122 $chh_ot .= '.gz';
1123 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n";
1124 }
1125 else{
1126 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n";
1127 }
1128
1129 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n";
1130 push @sorting_files,$chh_ot;
1131
1132 unless($no_header){
1133 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n";
1134 }
1135
1136 $chh_ctot =~ s/^/CHH_CTOT_/;
1137 $chh_ctot =~ s/sam$/txt/;
1138 $chh_ctot =~ s/bam$/txt/;
1139 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
1140 $chh_ctot = $output_dir . $chh_ctot;
1141
1142 if ($gzip){
1143 $chh_ctot .= '.gz';
1144 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n";
1145 }
1146 else{
1147 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n";
1148 }
1149
1150 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n";
1151 push @sorting_files,$chh_ctot;
1152
1153 unless($no_header){
1154 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n";
1155 }
1156
1157 $chh_ctob =~ s/^/CHH_CTOB_/;
1158 $chh_ctob =~ s/sam$/txt/;
1159 $chh_ctob =~ s/bam$/txt/;
1160 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
1161 $chh_ctob = $output_dir . $chh_ctob;
1162
1163 if ($gzip){
1164 $chh_ctob .= '.gz';
1165 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n";
1166 }
1167 else{
1168 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n";
1169 }
1170
1171 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n";
1172 push @sorting_files,$chh_ctob;
1173
1174 unless($no_header){
1175 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n";
1176 }
1177
1178 $chh_ob =~ s/^/CHH_OB_/;
1179 $chh_ob =~ s/sam$/txt/;
1180 $chh_ob =~ s/bam$/txt/;
1181 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
1182 $chh_ob = $output_dir . $chh_ob;
1183
1184 if ($gzip){
1185 $chh_ob .= '.gz';
1186 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n";
1187 }
1188 else{
1189 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n";
1190 }
1191
1192 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n";
1193 push @sorting_files,$chh_ob;
1194
1195 unless($no_header){
1196 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n";
1197 }
1198 }
1199
1200 my $methylation_call_strings_processed = 0;
1201 my $line_count = 0;
1202
1203 ### proceeding differently now for single-end or paired-end Bismark files
1204
1205 ### PROCESSING SINGLE-END RESULT FILES
1206 if ($single) {
1207
1208 ### also proceeding differently now for SAM format or vanilla Bismark format files
1209 if ($vanilla) { # old vanilla Bismark output format
1210 while (<IN>) {
1211 ++$line_count;
1212 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1213
1214 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1215 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
1216
1217 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
1218 ### last position
1219 chomp $genome_conversion;
1220
1221 my $index;
1222 if ($meth_call) {
1223
1224 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1225 $index = 0;
1226 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1227 $index = 1;
1228 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1229 $index = 3;
1230 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1231 $index = 2;
1232 } else {
1233 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1234 }
1235
1236 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1237 if ($ignore) {
1238 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1239
1240 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1241 if ($strand eq '+') {
1242 $start += $ignore;
1243 } elsif ($strand eq '-') {
1244 $start += length($meth_call)-1; ## $meth_call is already shortened!
1245 } else {
1246 die "Alignment did not have proper strand information: $strand\n";
1247 }
1248 }
1249 ### printing out the methylation state of every C in the read
1250 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
1251
1252 ++$methylation_call_strings_processed; # 1 per single-end result
1253 }
1254 }
1255 } else { # processing single-end SAM format (default)
1256 while (<IN>) {
1257 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1258 if (/^\@/) { # skipping header lines (starting with @)
1259 warn "skipping SAM header line:\t$_";
1260 next;
1261 }
1262
1263 ++$line_count;
1264 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1265
1266 # example read in SAM format
1267 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1268 ###
1269
1270 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1271 # < 0.7.6 $meth_call =~ s/^XM:Z://;
1272 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
1273 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
1274
1275 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
1276
1277 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1278 my $meth_call; ### Thanks to Zachary Zeno for this solution
1279 my $read_conversion;
1280 my $genome_conversion;
1281
1282 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1283 my $tag = $1;
1284 my $value = $2;
1285
1286 if ($tag eq "XM") {
1287 $meth_call = $value;
1288 $meth_call =~ s/\r//;
1289 } elsif ($tag eq "XR") {
1290 $read_conversion = $value;
1291 $read_conversion =~ s/\r//;
1292 } elsif ($tag eq "XG") {
1293 $genome_conversion = $value;
1294 $genome_conversion =~ s/\r//;
1295 }
1296 }
1297
1298 my $strand;
1299 chomp $genome_conversion;
1300 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
1301
1302 my $index;
1303 if ($meth_call) {
1304 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1305 $index = 0;
1306 $strand = '+';
1307 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1308 $index = 1;
1309 $strand = '-';
1310 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1311 $index = 2;
1312 $strand = '+';
1313 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1314 $index = 3;
1315 $strand = '-';
1316 } else {
1317 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1318 }
1319
1320 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
1321 if ($strand eq '-') {
1322 $meth_call = reverse $meth_call;
1323 }
1324
1325 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1326 if ($ignore) {
1327 # print "\n\n$meth_call\n";
1328 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1329 # print "$meth_call\n";
1330 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1331
1332 my @len = split (/\D+/,$cigar); # storing the length per operation
1333 my @ops = split (/\d+/,$cigar); # storing the operation
1334 shift @ops; # remove the empty first element
1335 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1336
1337 my @comp_cigar; # building an array with all CIGAR operations
1338 foreach my $index (0..$#len) {
1339 foreach (1..$len[$index]) {
1340 # print "$ops[$index]";
1341 push @comp_cigar, $ops[$index];
1342 }
1343 }
1344 # print "original CIGAR: $cigar\n";
1345 # print "original CIGAR: @comp_cigar\n";
1346
1347 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1348 if ($strand eq '+') {
1349
1350 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1351 my $I_count = 0;
1352
1353 for (1..$ignore) {
1354 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
1355 # print "$_ deleted $op\n";
1356
1357 while ($op eq 'D') { # repeating this for deletions (D)
1358 $D_count++;
1359 $op = shift @comp_cigar;
1360 # print "$_ deleted $op\n";
1361 }
1362 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1363 $I_count++;
1364 }
1365 }
1366 $start += $ignore + $D_count - $I_count;
1367 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
1368 } elsif ($strand eq '-') {
1369
1370 for (1..$ignore) {
1371 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1372 while ($op eq 'D') { # repeating this for deletions (D)
1373 $op = pop @comp_cigar;
1374 }
1375 }
1376
1377 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
1378 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
1379 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
1380 foreach (@comp_cigar) {
1381 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
1382 }
1383 $start += $MD_count - 1;
1384 }
1385
1386 ### reconstituting shortened CIGAR string
1387 my $new_cigar;
1388 my $count = 0;
1389 my $last_op;
1390 # print "ignore adjusted: @comp_cigar\n";
1391 foreach my $op (@comp_cigar) {
1392 unless (defined $last_op){
1393 $last_op = $op;
1394 ++$count;
1395 next;
1396 }
1397 if ($last_op eq $op) {
1398 ++$count;
1399 } else {
1400 $new_cigar .= "$count$last_op";
1401 $last_op = $op;
1402 $count = 1;
1403 }
1404 }
1405 $new_cigar .= "$count$last_op"; # appending the last operation and count
1406 $cigar = $new_cigar;
1407 # print "ignore adjusted scalar: $cigar\n";
1408 }
1409 }
1410 ### printing out the methylation state of every C in the read
1411 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
1412
1413 ++$methylation_call_strings_processed; # 1 per single-end result
1414 }
1415 }
1416 }
1417
1418 ### PROCESSING PAIRED-END RESULT FILES
1419 elsif ($paired) {
1420
1421 ### proceeding differently now for SAM format or vanilla Bismark format files
1422 if ($vanilla) { # old vanilla Bismark paired-end output format
1423 while (<IN>) {
1424 ++$line_count;
1425 warn "processed line: $line_count\n" if ($line_count%500000==0);
1426
1427 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1428 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
1429
1430 my $index;
1431 chomp $genome_conversion;
1432
1433 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1434 $index = 0; ## this is OT
1435 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1436 $index = 2; ## this is CTOB!!!
1437 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1438 $index = 1; ## this is CTOT!!!
1439 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1440 $index = 3; ## this is OB
1441 } else {
1442 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1443 }
1444
1445 if ($meth_call_1 and $meth_call_2) {
1446 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1447 if ($ignore) {
1448 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1449 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
1450
1451 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1452 $start_read_1 += $ignore;
1453 $end_read_2 -= $ignore;
1454 }
1455 my $end_read_1;
1456 my $start_read_2;
1457
1458 if ($strand eq '+') {
1459
1460 $end_read_1 = $start_read_1+length($meth_call_1)-1;
1461 $start_read_2 = $end_read_2-length($meth_call_2)+1;
1462
1463 ## we first pass the first read which is in + orientation on the forward strand
1464 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0);
1465
1466 # we next pass the second read which is in - orientation on the reverse strand
1467 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1468 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1);
1469 } else {
1470
1471 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
1472 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
1473
1474 ## we first pass the first read which is in - orientation on the reverse strand
1475 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0);
1476
1477 # we next pass the second read which is in + orientation on the forward strand
1478 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1479 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2);
1480 }
1481
1482 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1483 }
1484 }
1485 } else { # Bismark paired-end SAM output format (default)
1486 while (<IN>) {
1487 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1488 if (/^\@/) { # skipping header lines (starting with @)
1489 warn "skipping SAM header line:\t$_";
1490 next;
1491 }
1492
1493 ++$line_count;
1494 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1495
1496 # example paired-end reads in SAM format (2 consecutive lines)
1497 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1498 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
1499
1500 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1501
1502 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1503 my $meth_call_1;
1504 my $first_read_conversion;
1505 my $genome_conversion;
1506
1507 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1508 my $tag = $1;
1509 my $value = $2;
1510
1511 if ($tag eq "XM") {
1512 $meth_call_1 = $value;
1513 $meth_call_1 =~ s/\r//;
1514 } elsif ($tag eq "XR") {
1515 $first_read_conversion = $value;
1516 $first_read_conversion =~ s/\r//;
1517 } elsif ($tag eq "XG") {
1518 $genome_conversion = $value;
1519 $genome_conversion =~ s/\r//;
1520 }
1521 }
1522
1523 $_ = <IN>; # reading in the paired read
1524
1525 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
1526 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
1527 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
1528 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
1529 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
1530
1531 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1532
1533 my $meth_call_2;
1534 my $second_read_conversion;
1535
1536 while ( /(XM|XR):Z:([^\t]+)/g ) {
1537 my $tag = $1;
1538 my $value = $2;
1539
1540 if ($tag eq "XM") {
1541 $meth_call_2 = $value;
1542 $meth_call_2 =~ s/\r//;
1543 } elsif ($tag eq "XR") {
1544 $second_read_conversion = $value;
1545 $second_read_conversion = s/\r//;
1546 }
1547 }
1548
1549 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
1550 chomp $genome_conversion; # in case it captured a new line character
1551
1552 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
1553
1554 my $index;
1555 my $strand;
1556
1557 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1558 $index = 0; ## this is OT
1559 $strand = '+';
1560 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1561 $index = 1; ## this is CTOT
1562 $strand = '-';
1563 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1564 $index = 2; ## this is CTOB
1565 $strand = '+';
1566 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1567 $index = 3; ## this is OB
1568 $strand = '-';
1569 } else {
1570 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1571 }
1572
1573 ### reversing the methylation call of the read that was reverse-complemented
1574 if ($strand eq '+') {
1575 $meth_call_2 = reverse $meth_call_2;
1576 } else {
1577 $meth_call_1 = reverse $meth_call_1;
1578 }
1579
1580 if ($meth_call_1 and $meth_call_2) {
1581
1582 my $end_read_1;
1583
1584 ### READ 1
1585 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
1586 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
1587 shift @ops_1; # remove the empty first element
1588 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
1589
1590 my @comp_cigar_1; # building an array with all CIGAR operations
1591 foreach my $index (0..$#len_1) {
1592 foreach (1..$len_1[$index]) {
1593 # print "$ops_1[$index]";
1594 push @comp_cigar_1, $ops_1[$index];
1595 }
1596 }
1597 # print "original CIGAR read 1: $cigar_1\n";
1598 # print "original CIGAR read 1: @comp_cigar_1\n";
1599
1600 ### READ 2
1601 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
1602 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
1603 shift @ops_2; # remove the empty first element
1604 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
1605 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
1606 foreach my $index (0..$#len_2) {
1607 foreach (1..$len_2[$index]) {
1608 # print "$ops_2[$index]";
1609 push @comp_cigar_2, $ops_2[$index];
1610 }
1611 }
1612 # print "original CIGAR read 2: $cigar_2\n";
1613 # print "original CIGAR read 2: @comp_cigar_2\n";
1614
1615 if ($ignore) {
1616 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1617 ### the methylation calls have already been reversed where necessary
1618 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1619 $meth_call_2 = substr($meth_call_2,$ignore,length($meth_call_2)-$ignore);
1620
1621 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1622
1623 if ($strand eq '+') {
1624
1625 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
1626 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
1627 my $I_count_1 = 0;
1628
1629 for (1..$ignore) {
1630 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
1631 # print "$_ deleted $op\n";
1632
1633 while ($op eq 'D') { # repeating this for deletions (D)
1634 $D_count_1++;
1635 $op = shift @comp_cigar_1;
1636 # print "$_ deleted $op\n";
1637 }
1638 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1639 $I_count_1++;
1640 }
1641 }
1642
1643 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
1644 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
1645
1646 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
1647
1648 for (1..$ignore) {
1649 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1650 while ($op eq 'D') { # repeating this for deletions (D)
1651 $op = pop @comp_cigar_2;
1652 }
1653 }
1654 # the start position of reads mapping to the reverse strand is being adjusted further below
1655 } elsif ($strand eq '-') {
1656
1657 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
1658 for (1..$ignore) {
1659 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1660 while ($op eq 'D') { # repeating this for deletions (D)
1661 $op = pop @comp_cigar_1;
1662 }
1663 }
1664 # the start position of reads mapping to the reverse strand is being adjusted further below
1665
1666 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
1667 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
1668 my $I_count_2 = 0;
1669
1670 for (1..$ignore) {
1671 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
1672 # print "$_ deleted $op\n";
1673
1674 while ($op eq 'D') { # repeating this for deletions (D)
1675 $D_count_2++;
1676 $op = shift @comp_cigar_2;
1677 # print "$_ deleted $op\n";
1678 }
1679 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1680 $I_count_2++;
1681 }
1682 }
1683
1684 $start_read_2 += $ignore + $D_count_2 - $I_count_2;
1685 # print "start read 2 $start_read_2\t ignore: $ignore\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
1686
1687 }
1688
1689 ### reconstituting shortened CIGAR string 1
1690 my $new_cigar_1;
1691 my $count_1 = 0;
1692 my $last_op_1;
1693 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
1694 foreach my $op (@comp_cigar_1) {
1695 unless (defined $last_op_1){
1696 $last_op_1 = $op;
1697 ++$count_1;
1698 next;
1699 }
1700 if ($last_op_1 eq $op) {
1701 ++$count_1;
1702 } else {
1703 $new_cigar_1 .= "$count_1$last_op_1";
1704 $last_op_1 = $op;
1705 $count_1 = 1;
1706 }
1707 }
1708 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
1709 $cigar_1 = $new_cigar_1;
1710 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
1711
1712 ### reconstituting shortened CIGAR string 2
1713 my $new_cigar_2;
1714 my $count_2 = 0;
1715 my $last_op_2;
1716 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
1717 foreach my $op (@comp_cigar_2) {
1718 unless (defined $last_op_2){
1719 $last_op_2 = $op;
1720 ++$count_2;
1721 next;
1722 }
1723 if ($last_op_2 eq $op) {
1724 ++$count_2;
1725 } else {
1726 $new_cigar_2 .= "$count_2$last_op_2";
1727 $last_op_2 = $op;
1728 $count_2 = 1;
1729 }
1730 }
1731 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
1732 $cigar_2 = $new_cigar_2;
1733 # print "ignore adjusted CIGAR 2 scalar: $cigar_2\n";
1734
1735 }
1736
1737 if ($strand eq '+') {
1738 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
1739 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1740 # print "reverse: @comp_cigar_2\n";
1741
1742 my $MD_count_1 = 0;
1743 foreach (@comp_cigar_1) {
1744 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1745 }
1746
1747 my $MD_count_2 = 0;
1748 foreach (@comp_cigar_2) {
1749 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1750 }
1751
1752 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
1753 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
1754 } else {
1755 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
1756
1757 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1758 # print "reverse: @comp_cigar_1\n";
1759
1760 my $MD_count_1 = 0;
1761 foreach (@comp_cigar_1) {
1762 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
1763 }
1764
1765 $end_read_1 = $start_read_1;
1766 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
1767
1768 }
1769
1770 if ($strand eq '+') {
1771 ## we first pass the first read which is in + orientation on the forward strand
1772 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1);
1773
1774 # we next pass the second read which is in - orientation on the reverse strand
1775 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1776 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2);
1777 } else {
1778 ## we first pass the first read which is in - orientation on the reverse strand
1779 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1);
1780
1781 # we next pass the second read which is in + orientation on the forward strand
1782 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1783 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2);
1784 }
1785
1786 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1787 }
1788 }
1789 }
1790 } else {
1791 die "Single-end or paired-end reads not specified properly\n";
1792 }
1793
1794 print "\n\nProcessed $line_count lines from $filename in total\n";
1795 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1796 if ($report) {
1797 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
1798 }
1799 print_splitting_report ();
1800 }
1801
1802
1803
1804 sub print_splitting_report{
1805
1806 ### Calculating methylation percentages if applicable
1807
1808 my $percent_meCpG;
1809 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
1810 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
1811 }
1812
1813 my $percent_meCHG;
1814 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1815 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
1816 }
1817
1818 my $percent_meCHH;
1819 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
1820 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
1821 }
1822
1823 my $percent_non_CpG_methylation;
1824 if ($merge_non_CpG){
1825 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1826 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
1827 }
1828 }
1829
1830 if ($report){
1831 ### detailed information about Cs analysed
1832 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
1833
1834 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1835 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
1836
1837 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1838 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1839 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1840
1841 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1842 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1843 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1844
1845 ### calculating methylated CpG percentage if applicable
1846 if ($percent_meCpG){
1847 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
1848 }
1849 else{
1850 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1851 }
1852
1853 ### 2-Context Output
1854 if ($merge_non_CpG){
1855 if ($percent_non_CpG_methylation){
1856 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1857 }
1858 else{
1859 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1860 }
1861 }
1862
1863 ### 3 Context Output
1864 else{
1865 ### calculating methylated CHG percentage if applicable
1866 if ($percent_meCHG){
1867 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
1868 }
1869 else{
1870 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1871 }
1872
1873 ### calculating methylated CHH percentage if applicable
1874 if ($percent_meCHH){
1875 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1876 }
1877 else{
1878 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1879 }
1880 }
1881 }
1882
1883 ### detailed information about Cs analysed for on-screen report
1884 print "Final Cytosine Methylation Report\n",'='x33,"\n";
1885
1886 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
1887 print "Total number of C's analysed:\t$total_number_of_C\n\n";
1888
1889 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1890 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1891 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
1892
1893 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1894 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1895 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
1896
1897 ### printing methylated CpG percentage if applicable
1898 if ($percent_meCpG){
1899 print "C methylated in CpG context:\t${percent_meCpG}%\n";
1900 }
1901 else{
1902 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1903 }
1904
1905 ### 2-Context Output
1906 if ($merge_non_CpG){
1907 if ($percent_non_CpG_methylation){
1908 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
1909 }
1910 else{
1911 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
1912 }
1913 }
1914
1915 ### 3-Context Output
1916 else{
1917 ### printing methylated CHG percentage if applicable
1918 if ($percent_meCHG){
1919 print "C methylated in CHG context:\t${percent_meCHG}%\n";
1920 }
1921 else{
1922 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1923 }
1924
1925 ### printing methylated CHH percentage if applicable
1926 if ($percent_meCHH){
1927 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
1928 }
1929 else{
1930 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
1931 }
1932 }
1933 }
1934
1935
1936
1937
1938
1939 sub print_individual_C_methylation_states_paired_end_files{
1940
1941 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar) = @_;
1942 my @methylation_calls = split(//,$meth_call);
1943
1944 #################################################################
1945 ### . for bases not involving cytosines ###
1946 ### X for methylated C in CHG context (was protected) ###
1947 ### x for not methylated C in CHG context (was converted) ###
1948 ### H for methylated C in CHH context (was protected) ###
1949 ### h for not methylated C in CHH context (was converted) ###
1950 ### Z for methylated C in CpG context (was protected) ###
1951 ### z for not methylated C in CpG context (was converted) ###
1952 #################################################################
1953
1954 my $methyl_CHG_count = 0;
1955 my $methyl_CHH_count = 0;
1956 my $methyl_CpG_count = 0;
1957 my $unmethylated_CHG_count = 0;
1958 my $unmethylated_CHH_count = 0;
1959 my $unmethylated_CpG_count = 0;
1960
1961
1962 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
1963 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
1964 my @comp_cigar;
1965
1966 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
1967 if ($cigar =~ /^\d+M$/){
1968 }
1969 else{ # parsing CIGAR string
1970 my @len;
1971 my @ops;
1972 @len = split (/\D+/,$cigar); # storing the length per operation
1973 @ops = split (/\d+/,$cigar); # storing the operation
1974 shift @ops; # remove the empty first element
1975 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1976
1977 foreach my $index (0..$#len){
1978 foreach (1..$len[$index]){
1979 # print "$ops[$index]";
1980 push @comp_cigar, $ops[$index];
1981 }
1982 }
1983 # warn "\nDetected CIGAR string: $cigar\n";
1984 # warn "Length of methylation call: ",length $meth_call,"\n";
1985 # warn "number of operations: ",scalar @ops,"\n";
1986 # warn "number of length digits: ",scalar @len,"\n\n";
1987 # print @comp_cigar,"\n";
1988 # print "$meth_call\n\n";
1989 # sleep (1);
1990 }
1991
1992 if ($strand eq '-') {
1993
1994 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
1995 if (@comp_cigar){
1996 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
1997 }
1998 # print "reverse CIGAR string: @comp_cigar\n";
1999
2000 ### the start position of paired-end files has already been corrected, see above
2001 }
2002
2003 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2004
2005 if ($merge_non_CpG) {
2006
2007 if ($no_overlap) {
2008
2009 ### single-file CpG and non-CpG context output
2010 if ($full) {
2011 if ($strand eq '+') {
2012 for my $index (0..$#methylation_calls) {
2013
2014 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2015 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2016 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2017 $cigar_offset += $cigar_mod;
2018 $pos_offset += $pos_mod;
2019 }
2020
2021 ### Returning as soon as the methylation calls start overlapping
2022 if ($start+$index+$pos_offset >= $end_read_1) {
2023 return;
2024 }
2025
2026 if ($methylation_calls[$index] eq 'X') {
2027 $counting{total_meCHG_count}++;
2028 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2029 } elsif ($methylation_calls[$index] eq 'x') {
2030 $counting{total_unmethylated_CHG_count}++;
2031 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2032 } elsif ($methylation_calls[$index] eq 'Z') {
2033 $counting{total_meCpG_count}++;
2034 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2035 } elsif ($methylation_calls[$index] eq 'z') {
2036 $counting{total_unmethylated_CpG_count}++;
2037 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2038 } elsif ($methylation_calls[$index] eq 'H') {
2039 $counting{total_meCHH_count}++;
2040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2041 } elsif ($methylation_calls[$index] eq 'h') {
2042 $counting{total_unmethylated_CHH_count}++;
2043 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2044 }
2045 elsif ($methylation_calls[$index] eq '.'){}
2046 else{
2047 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2048 }
2049 }
2050 } elsif ($strand eq '-') {
2051 for my $index (0..$#methylation_calls) {
2052
2053 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2054 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2055 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2056 $cigar_offset += $cigar_mod;
2057 $pos_offset += $pos_mod;
2058 }
2059
2060 ### Returning as soon as the methylation calls start overlapping
2061 if ($start-$index+$pos_offset <= $end_read_1) {
2062 return;
2063 }
2064
2065 if ($methylation_calls[$index] eq 'X') {
2066 $counting{total_meCHG_count}++;
2067 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2068 } elsif ($methylation_calls[$index] eq 'x') {
2069 $counting{total_unmethylated_CHG_count}++;
2070 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2071 } elsif ($methylation_calls[$index] eq 'Z') {
2072 $counting{total_meCpG_count}++;
2073 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2074 } elsif ($methylation_calls[$index] eq 'z') {
2075 $counting{total_unmethylated_CpG_count}++;
2076 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2077 } elsif ($methylation_calls[$index] eq 'H') {
2078 $counting{total_meCHH_count}++;
2079 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2080 } elsif ($methylation_calls[$index] eq 'h') {
2081 $counting{total_unmethylated_CHH_count}++;
2082 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2083 }
2084 elsif ($methylation_calls[$index] eq '.') {}
2085 else{
2086 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2087 }
2088 }
2089 } else {
2090 die "The read orientation was neither + nor -: '$strand'\n";
2091 }
2092 }
2093
2094 ### strand-specific methylation output
2095 else {
2096 if ($strand eq '+') {
2097 for my $index (0..$#methylation_calls) {
2098
2099 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2100 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2101 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2102 $cigar_offset += $cigar_mod;
2103 $pos_offset += $pos_mod;
2104 }
2105
2106 ### Returning as soon as the methylation calls start overlapping
2107 if ($start+$index+$pos_offset >= $end_read_1) {
2108 return;
2109 }
2110
2111 if ($methylation_calls[$index] eq 'X') {
2112 $counting{total_meCHG_count}++;
2113 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2114 } elsif ($methylation_calls[$index] eq 'x') {
2115 $counting{total_unmethylated_CHG_count}++;
2116 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2117 } elsif ($methylation_calls[$index] eq 'Z') {
2118 $counting{total_meCpG_count}++;
2119 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2120 } elsif ($methylation_calls[$index] eq 'z') {
2121 $counting{total_unmethylated_CpG_count}++;
2122 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2123 } elsif ($methylation_calls[$index] eq 'H') {
2124 $counting{total_meCHH_count}++;
2125 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2126 } elsif ($methylation_calls[$index] eq 'h') {
2127 $counting{total_unmethylated_CHH_count}++;
2128 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2129 }
2130 elsif ($methylation_calls[$index] eq '.') {}
2131 else{
2132 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2133 }
2134 }
2135 } elsif ($strand eq '-') {
2136 for my $index (0..$#methylation_calls) {
2137
2138 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2139 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2140 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2141 $cigar_offset += $cigar_mod;
2142 $pos_offset += $pos_mod;
2143 }
2144
2145 ### Returning as soon as the methylation calls start overlapping
2146 if ($start-$index+$pos_offset <= $end_read_1) {
2147 return;
2148 }
2149
2150 if ($methylation_calls[$index] eq 'X') {
2151 $counting{total_meCHG_count}++;
2152 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2153 } elsif ($methylation_calls[$index] eq 'x') {
2154 $counting{total_unmethylated_CHG_count}++;
2155 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2156 } elsif ($methylation_calls[$index] eq 'Z') {
2157 $counting{total_meCpG_count}++;
2158 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2159 } elsif ($methylation_calls[$index] eq 'z') {
2160 $counting{total_unmethylated_CpG_count}++;
2161 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2162 } elsif ($methylation_calls[$index] eq 'H') {
2163 $counting{total_meCHH_count}++;
2164 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2165 } elsif ($methylation_calls[$index] eq 'h') {
2166 $counting{total_unmethylated_CHH_count}++;
2167 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2168 }
2169 elsif ($methylation_calls[$index] eq '.') {}
2170 else{
2171 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2172 }
2173 }
2174 } else {
2175 die "The strand orientation was neither + nor -: '$strand'/n";
2176 }
2177 }
2178 }
2179
2180 ### this is the default paired-end procedure allowing overlaps and using every single C position
2181 ### Still within the 2-CONTEXT ONLY optional section
2182 else {
2183 ### single-file CpG and non-CpG context output
2184 if ($full) {
2185 if ($strand eq '+') {
2186 for my $index (0..$#methylation_calls) {
2187
2188 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2191 $cigar_offset += $cigar_mod;
2192 $pos_offset += $pos_mod;
2193 }
2194
2195 if ($methylation_calls[$index] eq 'X') {
2196 $counting{total_meCHG_count}++;
2197 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2198 } elsif ($methylation_calls[$index] eq 'x') {
2199 $counting{total_unmethylated_CHG_count}++;
2200 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2201 } elsif ($methylation_calls[$index] eq 'Z') {
2202 $counting{total_meCpG_count}++;
2203 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2204 } elsif ($methylation_calls[$index] eq 'z') {
2205 $counting{total_unmethylated_CpG_count}++;
2206 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2207 } elsif ($methylation_calls[$index] eq 'H') {
2208 $counting{total_meCHH_count}++;
2209 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2210 } elsif ($methylation_calls[$index] eq 'h') {
2211 $counting{total_unmethylated_CHH_count}++;
2212 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2213 }
2214 elsif ($methylation_calls[$index] eq '.') {}
2215 else{
2216 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2217 }
2218 }
2219 } elsif ($strand eq '-') {
2220 for my $index (0..$#methylation_calls) {
2221
2222 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2223 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2224 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2225 $cigar_offset += $cigar_mod;
2226 $pos_offset += $pos_mod;
2227 }
2228
2229 if ($methylation_calls[$index] eq 'X') {
2230 $counting{total_meCHG_count}++;
2231 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2232 } elsif ($methylation_calls[$index] eq 'x') {
2233 $counting{total_unmethylated_CHG_count}++;
2234 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2235 } elsif ($methylation_calls[$index] eq 'Z') {
2236 $counting{total_meCpG_count}++;
2237 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2238 } elsif ($methylation_calls[$index] eq 'z') {
2239 $counting{total_unmethylated_CpG_count}++;
2240 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2241 } elsif ($methylation_calls[$index] eq 'H') {
2242 $counting{total_meCHH_count}++;
2243 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2244 } elsif ($methylation_calls[$index] eq 'h') {
2245 $counting{total_unmethylated_CHH_count}++;
2246 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2247 }
2248 elsif ($methylation_calls[$index] eq '.') {}
2249 else{
2250 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2251 }
2252 }
2253 } else {
2254 die "The strand orientation as neither + nor -: '$strand'\n";
2255 }
2256 }
2257
2258 ### strand-specific methylation output
2259 ### still within the 2-CONTEXT optional section
2260 else {
2261 if ($strand eq '+') {
2262 for my $index (0..$#methylation_calls) {
2263
2264 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2265 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2266 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2267 $cigar_offset += $cigar_mod;
2268 $pos_offset += $pos_mod;
2269 }
2270
2271 if ($methylation_calls[$index] eq 'X') {
2272 $counting{total_meCHG_count}++;
2273 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2274 } elsif ($methylation_calls[$index] eq 'x') {
2275 $counting{total_unmethylated_CHG_count}++;
2276 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2277 } elsif ($methylation_calls[$index] eq 'Z') {
2278 $counting{total_meCpG_count}++;
2279 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2280 } elsif ($methylation_calls[$index] eq 'z') {
2281 $counting{total_unmethylated_CpG_count}++;
2282 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2283 } elsif ($methylation_calls[$index] eq 'H') {
2284 $counting{total_meCHH_count}++;
2285 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2286 } elsif ($methylation_calls[$index] eq 'h') {
2287 $counting{total_unmethylated_CHH_count}++;
2288 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2289 }
2290 elsif ($methylation_calls[$index] eq '.') {}
2291 else{
2292 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2293 }
2294 }
2295 } elsif ($strand eq '-') {
2296 for my $index (0..$#methylation_calls) {
2297
2298 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2299 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2300 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2301 $cigar_offset += $cigar_mod;
2302 $pos_offset += $pos_mod;
2303 }
2304
2305 if ($methylation_calls[$index] eq 'X') {
2306 $counting{total_meCHG_count}++;
2307 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2308 } elsif ($methylation_calls[$index] eq 'x') {
2309 $counting{total_unmethylated_CHG_count}++;
2310 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2311 } elsif ($methylation_calls[$index] eq 'Z') {
2312 $counting{total_meCpG_count}++;
2313 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2314 } elsif ($methylation_calls[$index] eq 'z') {
2315 $counting{total_unmethylated_CpG_count}++;
2316 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2317 } elsif ($methylation_calls[$index] eq 'H') {
2318 $counting{total_meCHH_count}++;
2319 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2320 } elsif ($methylation_calls[$index] eq 'h') {
2321 $counting{total_unmethylated_CHH_count}++;
2322 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2323 }
2324 elsif ($methylation_calls[$index] eq '.') {}
2325 else{
2326 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2327 }
2328 }
2329 } else {
2330 die "The strand orientation as neither + nor -: '$strand'\n";
2331 }
2332 }
2333 }
2334 }
2335
2336 ############################################
2337 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
2338 ############################################
2339
2340 elsif ($no_overlap) {
2341 ### single-file CpG, CHG and CHH context output
2342 if ($full) {
2343 if ($strand eq '+') {
2344 for my $index (0..$#methylation_calls) {
2345
2346 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2347 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2348 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2349 $cigar_offset += $cigar_mod;
2350 $pos_offset += $pos_mod;
2351 }
2352
2353 ### Returning as soon as the methylation calls start overlapping
2354 if ($start+$index+$pos_offset >= $end_read_1) {
2355 return;
2356 }
2357
2358 if ($methylation_calls[$index] eq 'X') {
2359 $counting{total_meCHG_count}++;
2360 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2361 } elsif ($methylation_calls[$index] eq 'x') {
2362 $counting{total_unmethylated_CHG_count}++;
2363 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2364 } elsif ($methylation_calls[$index] eq 'Z') {
2365 $counting{total_meCpG_count}++;
2366 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2367 } elsif ($methylation_calls[$index] eq 'z') {
2368 $counting{total_unmethylated_CpG_count}++;
2369 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2370 } elsif ($methylation_calls[$index] eq 'H') {
2371 $counting{total_meCHH_count}++;
2372 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2373 } elsif ($methylation_calls[$index] eq 'h') {
2374 $counting{total_unmethylated_CHH_count}++;
2375 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2376 }
2377 elsif ($methylation_calls[$index] eq '.') {}
2378 else{
2379 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2380 }
2381 }
2382 } elsif ($strand eq '-') {
2383 for my $index (0..$#methylation_calls) {
2384
2385 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2386 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2387 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2388 $cigar_offset += $cigar_mod;
2389 $pos_offset += $pos_mod;
2390 }
2391
2392 ### Returning as soon as the methylation calls start overlapping
2393 if ($start-$index+$pos_offset <= $end_read_1) {
2394 return;
2395 }
2396
2397 if ($methylation_calls[$index] eq 'X') {
2398 $counting{total_meCHG_count}++;
2399 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2400 } elsif ($methylation_calls[$index] eq 'x') {
2401 $counting{total_unmethylated_CHG_count}++;
2402 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2403 } elsif ($methylation_calls[$index] eq 'Z') {
2404 $counting{total_meCpG_count}++;
2405 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2406 } elsif ($methylation_calls[$index] eq 'z') {
2407 $counting{total_unmethylated_CpG_count}++;
2408 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2409 } elsif ($methylation_calls[$index] eq 'H') {
2410 $counting{total_meCHH_count}++;
2411 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2412 } elsif ($methylation_calls[$index] eq 'h') {
2413 $counting{total_unmethylated_CHH_count}++;
2414 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2415 }
2416 elsif ($methylation_calls[$index] eq '.') {}
2417 else{
2418 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2419 }
2420 }
2421 } else {
2422 die "The strand orientation as neither + nor -: '$strand'\n";
2423 }
2424 }
2425
2426 ### strand-specific methylation output
2427 else {
2428 if ($strand eq '+') {
2429 for my $index (0..$#methylation_calls) {
2430
2431 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2432 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2433 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2434 $cigar_offset += $cigar_mod;
2435 $pos_offset += $pos_mod;
2436 }
2437
2438 ### Returning as soon as the methylation calls start overlapping
2439 if ($start+$index+$pos_offset >= $end_read_1) {
2440 return;
2441 }
2442
2443 if ($methylation_calls[$index] eq 'X') {
2444 $counting{total_meCHG_count}++;
2445 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2446 } elsif ($methylation_calls[$index] eq 'x') {
2447 $counting{total_unmethylated_CHG_count}++;
2448 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2449 } elsif ($methylation_calls[$index] eq 'Z') {
2450 $counting{total_meCpG_count}++;
2451 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2452 } elsif ($methylation_calls[$index] eq 'z') {
2453 $counting{total_unmethylated_CpG_count}++;
2454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2455 } elsif ($methylation_calls[$index] eq 'H') {
2456 $counting{total_meCHH_count}++;
2457 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2458 } elsif ($methylation_calls[$index] eq 'h') {
2459 $counting{total_unmethylated_CHH_count}++;
2460 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2461 }
2462 elsif ($methylation_calls[$index] eq '.') {}
2463 else{
2464 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2465 }
2466 }
2467 } elsif ($strand eq '-') {
2468 for my $index (0..$#methylation_calls) {
2469
2470 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2471 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2472 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2473 $cigar_offset += $cigar_mod;
2474 $pos_offset += $pos_mod;
2475 }
2476
2477 ### Returning as soon as the methylation calls start overlapping
2478 if ($start-$index+$pos_offset <= $end_read_1) {
2479 return;
2480 }
2481
2482 if ($methylation_calls[$index] eq 'X') {
2483 $counting{total_meCHG_count}++;
2484 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2485 } elsif ($methylation_calls[$index] eq 'x') {
2486 $counting{total_unmethylated_CHG_count}++;
2487 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2488 } elsif ($methylation_calls[$index] eq 'Z') {
2489 $counting{total_meCpG_count}++;
2490 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2491 } elsif ($methylation_calls[$index] eq 'z') {
2492 $counting{total_unmethylated_CpG_count}++;
2493 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2494 } elsif ($methylation_calls[$index] eq 'H') {
2495 $counting{total_meCHH_count}++;
2496 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2497 } elsif ($methylation_calls[$index] eq 'h') {
2498 $counting{total_unmethylated_CHH_count}++;
2499 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2500 }
2501 elsif ($methylation_calls[$index] eq '.') {}
2502 else{
2503 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2504 }
2505 }
2506 } else {
2507 die "The strand orientation as neither + nor -: '$strand'\n";
2508 }
2509 }
2510 }
2511
2512 ### this is the default paired-end procedure allowing overlaps and using every single C position
2513 else {
2514 ### single-file CpG, CHG and CHH context output
2515 if ($full) {
2516 if ($strand eq '+') {
2517 for my $index (0..$#methylation_calls) {
2518
2519 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2520 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2521 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2522 $cigar_offset += $cigar_mod;
2523 $pos_offset += $pos_mod;
2524 }
2525
2526 if ($methylation_calls[$index] eq 'X') {
2527 $counting{total_meCHG_count}++;
2528 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2529 } elsif ($methylation_calls[$index] eq 'x') {
2530 $counting{total_unmethylated_CHG_count}++;
2531 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2532 } elsif ($methylation_calls[$index] eq 'Z') {
2533 $counting{total_meCpG_count}++;
2534 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2535 } elsif ($methylation_calls[$index] eq 'z') {
2536 $counting{total_unmethylated_CpG_count}++;
2537 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2538 } elsif ($methylation_calls[$index] eq 'H') {
2539 $counting{total_meCHH_count}++;
2540 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2541 } elsif ($methylation_calls[$index] eq 'h') {
2542 $counting{total_unmethylated_CHH_count}++;
2543 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2544 }
2545 elsif ($methylation_calls[$index] eq '.') {}
2546 else{
2547 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2548 }
2549 }
2550 } elsif ($strand eq '-') {
2551 for my $index (0..$#methylation_calls) {
2552
2553 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2554 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2555 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2556 $cigar_offset += $cigar_mod;
2557 $pos_offset += $pos_mod;
2558 }
2559
2560 if ($methylation_calls[$index] eq 'X') {
2561 $counting{total_meCHG_count}++;
2562 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2563 } elsif ($methylation_calls[$index] eq 'x') {
2564 $counting{total_unmethylated_CHG_count}++;
2565 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2566 } elsif ($methylation_calls[$index] eq 'Z') {
2567 $counting{total_meCpG_count}++;
2568 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2569 } elsif ($methylation_calls[$index] eq 'z') {
2570 $counting{total_unmethylated_CpG_count}++;
2571 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2572 } elsif ($methylation_calls[$index] eq 'H') {
2573 $counting{total_meCHH_count}++;
2574 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2575 } elsif ($methylation_calls[$index] eq 'h') {
2576 $counting{total_unmethylated_CHH_count}++;
2577 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2578 }
2579 elsif ($methylation_calls[$index] eq '.') {}
2580 else{
2581 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2582 }
2583 }
2584 } else {
2585 die "The strand orientation as neither + nor -: '$strand'\n";
2586 }
2587 }
2588
2589 ### strand-specific methylation output
2590 else {
2591 if ($strand eq '+') {
2592 for my $index (0..$#methylation_calls) {
2593
2594 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2595 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2596 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2597 $cigar_offset += $cigar_mod;
2598 $pos_offset += $pos_mod;
2599 }
2600
2601 if ($methylation_calls[$index] eq 'X') {
2602 $counting{total_meCHG_count}++;
2603 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2604 } elsif ($methylation_calls[$index] eq 'x') {
2605 $counting{total_unmethylated_CHG_count}++;
2606 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2607 } elsif ($methylation_calls[$index] eq 'Z') {
2608 $counting{total_meCpG_count}++;
2609 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2610 } elsif ($methylation_calls[$index] eq 'z') {
2611 $counting{total_unmethylated_CpG_count}++;
2612 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2613 } elsif ($methylation_calls[$index] eq 'H') {
2614 $counting{total_meCHH_count}++;
2615 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2616 } elsif ($methylation_calls[$index] eq 'h') {
2617 $counting{total_unmethylated_CHH_count}++;
2618 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2619 }
2620 elsif ($methylation_calls[$index] eq '.') {}
2621 else{
2622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2623 }
2624 }
2625 } elsif ($strand eq '-') {
2626 for my $index (0..$#methylation_calls) {
2627
2628 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2629 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2630 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2631 $cigar_offset += $cigar_mod;
2632 $pos_offset += $pos_mod;
2633 }
2634
2635 if ($methylation_calls[$index] eq 'X') {
2636 $counting{total_meCHG_count}++;
2637 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2638 } elsif ($methylation_calls[$index] eq 'x') {
2639 $counting{total_unmethylated_CHG_count}++;
2640 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2641 } elsif ($methylation_calls[$index] eq 'Z') {
2642 $counting{total_meCpG_count}++;
2643 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2644 } elsif ($methylation_calls[$index] eq 'z') {
2645 $counting{total_unmethylated_CpG_count}++;
2646 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2647 } elsif ($methylation_calls[$index] eq 'H') {
2648 $counting{total_meCHH_count}++;
2649 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2650 } elsif ($methylation_calls[$index] eq 'h') {
2651 $counting{total_unmethylated_CHH_count}++;
2652 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2653 }
2654 elsif ($methylation_calls[$index] eq '.') {}
2655 else{
2656 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2657 }
2658 }
2659 } else {
2660 die "The strand orientation as neither + nor -: '$strand'\n";
2661 }
2662 }
2663 }
2664 }
2665
2666 sub check_cigar_string {
2667 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
2668 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
2669 my ($new_cigar_offset,$new_pos_offset) = (0,0);
2670
2671 if ($strand eq '+') {
2672 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
2673
2674 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2675 # warn "position needs no adjustment\n";
2676 }
2677
2678 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
2679 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
2680 # warn "adjusted genomic position by -1 bp (insertion)\n";
2681 }
2682
2683 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2684 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2685 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
2686 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
2687
2688 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
2689 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2690 # warn "position needs no adjustment\n";
2691 last;
2692 }
2693 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
2694 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
2695 # warn "adjusted genomic position by another -1 bp (insertion)\n";
2696 last;
2697 }
2698 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2699 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2700 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
2701 # warn "adjusted genomic position by another +1 bp (deletion)\n";
2702 }
2703 else{
2704 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2705 }
2706 }
2707 }
2708 else{
2709 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2710 }
2711 }
2712
2713 elsif ($strand eq '-') {
2714 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
2715
2716 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2717 # warn "position needs no adjustment\n";
2718 }
2719
2720 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
2721 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
2722 # warn "adjusted genomic position by +1 bp (insertion)\n";
2723 }
2724
2725 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2726 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2727 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2728 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
2729
2730 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
2731 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
2732 # warn "Found new 'M' operation; position needs no adjustment\n";
2733 last;
2734 }
2735 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
2736 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
2737 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
2738 last;
2739 }
2740 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
2741 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
2742 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
2743 # warn "adjusted genomic position by another -1 bp (deletion)\n";
2744 }
2745 else{
2746 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2747 }
2748 }
2749 }
2750 else{
2751 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
2752 }
2753 }
2754 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
2755 return ($new_cigar_offset,$new_pos_offset);
2756 }
2757
2758 sub print_individual_C_methylation_states_single_end{
2759
2760 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
2761 my @methylation_calls = split(//,$meth_call);
2762
2763 #################################################################
2764 ### . for bases not involving cytosines ###
2765 ### X for methylated C in CHG context (was protected) ###
2766 ### x for not methylated C in CHG context (was converted) ###
2767 ### H for methylated C in CHH context (was protected) ###
2768 ### h for not methylated C in CHH context (was converted) ###
2769 ### Z for methylated C in CpG context (was protected) ###
2770 ### z for not methylated C in CpG context (was converted) ###
2771 #################################################################
2772
2773 my $methyl_CHG_count = 0;
2774 my $methyl_CHH_count = 0;
2775 my $methyl_CpG_count = 0;
2776 my $unmethylated_CHG_count = 0;
2777 my $unmethylated_CHH_count = 0;
2778 my $unmethylated_CpG_count = 0;
2779
2780 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2781 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2782
2783 my @comp_cigar;
2784
2785 if ($cigar){ # parsing CIGAR string
2786
2787 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
2788 if ($cigar =~ /^\d+M$/){
2789 # warn "See!? I told you so! $cigar\n";
2790 # sleep(1);
2791 }
2792 else{
2793
2794 my @len;
2795 my @ops;
2796
2797 @len = split (/\D+/,$cigar); # storing the length per operation
2798 @ops = split (/\d+/,$cigar); # storing the operation
2799 shift @ops; # remove the empty first element
2800 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2801
2802 foreach my $index (0..$#len){
2803 foreach (1..$len[$index]){
2804 # print "$ops[$index]";
2805 push @comp_cigar, $ops[$index];
2806 }
2807 }
2808 }
2809 # warn "\nDetected CIGAR string: $cigar\n";
2810 # warn "Length of methylation call: ",length $meth_call,"\n";
2811 # warn "number of operations: ",scalar @ops,"\n";
2812 # warn "number of length digits: ",scalar @len,"\n\n";
2813 # print @comp_cigar,"\n";
2814 # print "$meth_call\n\n";
2815 # sleep (1);
2816 }
2817
2818 ### adjusting the start position for all reads mapping to the reverse strand
2819 if ($strand eq '-') {
2820
2821 if (@comp_cigar){ # only needed for SAM reads with InDels
2822 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2823 # print @comp_cigar,"\n";
2824 }
2825
2826 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
2827
2828 if ($cigar){ ### SAM format
2829 if ($cigar =~ /^(\d+)M$/){ # linear match
2830 $start += $1 - 1;
2831 }
2832 else{ # InDel read
2833 my $MD_count = 0;
2834 foreach (@comp_cigar){
2835 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2836 }
2837 $start += $MD_count - 1;
2838 }
2839 }
2840 else{ ### vanilla format
2841 $start += length($meth_call)-1;
2842 }
2843 }
2844 }
2845
2846 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
2847
2848 ### single-file CpG and other-context output
2849 if ($full and $merge_non_CpG) {
2850 if ($strand eq '+') {
2851 for my $index (0..$#methylation_calls) {
2852
2853 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
2854 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2855 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2856 $cigar_offset += $cigar_mod;
2857 $pos_offset += $pos_mod;
2858 }
2859
2860 ### methylated Cs (any context) will receive a forward (+) orientation
2861 ### not methylated Cs (any context) will receive a reverse (-) orientation
2862 if ($methylation_calls[$index] eq 'X') {
2863 $counting{total_meCHG_count}++;
2864 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2865 }
2866 elsif ($methylation_calls[$index] eq 'x') {
2867 $counting{total_unmethylated_CHG_count}++;
2868 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2869 }
2870 elsif ($methylation_calls[$index] eq 'Z') {
2871 $counting{total_meCpG_count}++;
2872 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2873 }
2874 elsif ($methylation_calls[$index] eq 'z') {
2875 $counting{total_unmethylated_CpG_count}++;
2876 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2877 }
2878 elsif ($methylation_calls[$index] eq 'H') {
2879 $counting{total_meCHH_count}++;
2880 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2881 }
2882 elsif ($methylation_calls[$index] eq 'h') {
2883 $counting{total_unmethylated_CHH_count}++;
2884 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2885 }
2886 elsif ($methylation_calls[$index] eq '.') {
2887 }
2888 else{
2889 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2890 }
2891 }
2892 }
2893 elsif ($strand eq '-') {
2894
2895 for my $index (0..$#methylation_calls) {
2896 ### methylated Cs (any context) will receive a forward (+) orientation
2897 ### not methylated Cs (any context) will receive a reverse (-) orientation
2898
2899 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
2900 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2901 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2902 $cigar_offset += $cigar_mod;
2903 $pos_offset += $pos_mod;
2904 }
2905
2906 if ($methylation_calls[$index] eq 'X') {
2907 $counting{total_meCHG_count}++;
2908 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2909 }
2910 elsif ($methylation_calls[$index] eq 'x') {
2911 $counting{total_unmethylated_CHG_count}++;
2912 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2913 }
2914 elsif ($methylation_calls[$index] eq 'Z') {
2915 $counting{total_meCpG_count}++;
2916 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2917 }
2918 elsif ($methylation_calls[$index] eq 'z') {
2919 $counting{total_unmethylated_CpG_count}++;
2920 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2921 }
2922 elsif ($methylation_calls[$index] eq 'H') {
2923 $counting{total_meCHH_count}++;
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2925 }
2926 elsif ($methylation_calls[$index] eq 'h') {
2927 $counting{total_unmethylated_CHH_count}++;
2928 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
2929 }
2930 elsif ($methylation_calls[$index] eq '.'){
2931 }
2932 else{
2933 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2934 }
2935 }
2936 }
2937 else {
2938 die "The strand information was neither + nor -: $strand\n";
2939 }
2940 }
2941
2942 ### strand-specific methylation output
2943 elsif ($merge_non_CpG) {
2944 if ($strand eq '+') {
2945 for my $index (0..$#methylation_calls) {
2946 ### methylated Cs (any context) will receive a forward (+) orientation
2947 ### not methylated Cs (any context) will receive a reverse (-) orientation
2948
2949 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
2950 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2951 $cigar_offset += $cigar_mod;
2952 $pos_offset += $pos_mod;
2953 }
2954
2955 if ($methylation_calls[$index] eq 'X') {
2956 $counting{total_meCHG_count}++;
2957 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2958 }
2959 elsif ($methylation_calls[$index] eq 'x') {
2960 $counting{total_unmethylated_CHG_count}++;
2961 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2962 }
2963 elsif ($methylation_calls[$index] eq 'Z') {
2964 $counting{total_meCpG_count}++;
2965 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2966 }
2967 elsif ($methylation_calls[$index] eq 'z') {
2968 $counting{total_unmethylated_CpG_count}++;
2969 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2970 }
2971 elsif ($methylation_calls[$index] eq 'H') {
2972 $counting{total_meCHH_count}++;
2973 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2974 }
2975 elsif ($methylation_calls[$index] eq 'h') {
2976 $counting{total_unmethylated_CHH_count}++;
2977 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
2978 }
2979 elsif ($methylation_calls[$index] eq '.') {
2980 }
2981 else{
2982 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2983 }
2984 }
2985 }
2986 elsif ($strand eq '-') {
2987
2988 for my $index (0..$#methylation_calls) {
2989 ### methylated Cs (any context) will receive a forward (+) orientation
2990 ### not methylated Cs (any context) will receive a reverse (-) orientation
2991
2992 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2994 $cigar_offset += $cigar_mod;
2995 $pos_offset += $pos_mod;
2996 }
2997
2998 if ($methylation_calls[$index] eq 'X') {
2999 $counting{total_meCHG_count}++;
3000 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3001 }
3002 elsif ($methylation_calls[$index] eq 'x') {
3003 $counting{total_unmethylated_CHG_count}++;
3004 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3005 }
3006 elsif ($methylation_calls[$index] eq 'Z') {
3007 $counting{total_meCpG_count}++;
3008 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3009 }
3010 elsif ($methylation_calls[$index] eq 'z') {
3011 $counting{total_unmethylated_CpG_count}++;
3012 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3013 }
3014 elsif ($methylation_calls[$index] eq 'H') {
3015 $counting{total_meCHH_count}++;
3016 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3017 }
3018 elsif ($methylation_calls[$index] eq 'h') {
3019 $counting{total_unmethylated_CHH_count}++;
3020 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3021 }
3022 elsif ($methylation_calls[$index] eq '.') {
3023 }
3024 else{
3025 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3026 }
3027 }
3028 }
3029 else {
3030 die "The strand information was neither + nor -: $strand\n";
3031 }
3032 }
3033
3034 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
3035
3036 elsif ($full) {
3037 if ($strand eq '+') {
3038 for my $index (0..$#methylation_calls) {
3039 ### methylated Cs (any context) will receive a forward (+) orientation
3040 ### not methylated Cs (any context) will receive a reverse (-) orientation
3041
3042 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
3043 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3044 $cigar_offset += $cigar_mod;
3045 $pos_offset += $pos_mod;
3046 }
3047
3048 if ($methylation_calls[$index] eq 'X') {
3049 $counting{total_meCHG_count}++;
3050 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3051 } elsif ($methylation_calls[$index] eq 'x') {
3052 $counting{total_unmethylated_CHG_count}++;
3053 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3054 } elsif ($methylation_calls[$index] eq 'Z') {
3055 $counting{total_meCpG_count}++;
3056 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3057 } elsif ($methylation_calls[$index] eq 'z') {
3058 $counting{total_unmethylated_CpG_count}++;
3059 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3060 } elsif ($methylation_calls[$index] eq 'H') {
3061 $counting{total_meCHH_count}++;
3062 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3063 } elsif ($methylation_calls[$index] eq 'h') {
3064 $counting{total_unmethylated_CHH_count}++;
3065 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3066 }
3067 elsif ($methylation_calls[$index] eq '.') {}
3068 else{
3069 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3070 }
3071 }
3072 }
3073 elsif ($strand eq '-') {
3074
3075 for my $index (0..$#methylation_calls) {
3076 ### methylated Cs (any context) will receive a forward (+) orientation
3077 ### not methylated Cs (any context) will receive a reverse (-) orientation
3078
3079 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
3080 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3081 $cigar_offset += $cigar_mod;
3082 $pos_offset += $pos_mod;
3083 }
3084
3085 if ($methylation_calls[$index] eq 'X') {
3086 $counting{total_meCHG_count}++;
3087 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3088 } elsif ($methylation_calls[$index] eq 'x') {
3089 $counting{total_unmethylated_CHG_count}++;
3090 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3091 } elsif ($methylation_calls[$index] eq 'Z') {
3092 $counting{total_meCpG_count}++;
3093 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3094 } elsif ($methylation_calls[$index] eq 'z') {
3095 $counting{total_unmethylated_CpG_count}++;
3096 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3097 } elsif ($methylation_calls[$index] eq 'H') {
3098 $counting{total_meCHH_count}++;
3099 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3100 } elsif ($methylation_calls[$index] eq 'h') {
3101 $counting{total_unmethylated_CHH_count}++;
3102 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3103 }
3104 elsif ($methylation_calls[$index] eq '.') {}
3105 else{
3106 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3107 }
3108 }
3109 }
3110 else {
3111 die "The read had a strand orientation which was neither + nor -: $strand\n";
3112 }
3113 }
3114
3115 ### strand-specific methylation output
3116 else {
3117 if ($strand eq '+') {
3118 for my $index (0..$#methylation_calls) {
3119 ### methylated Cs (any context) will receive a forward (+) orientation
3120 ### not methylated Cs (any context) will receive a reverse (-) orientation
3121
3122 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
3123 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3124 $cigar_offset += $cigar_mod;
3125 $pos_offset += $pos_mod;
3126 }
3127
3128 if ($methylation_calls[$index] eq 'X') {
3129 $counting{total_meCHG_count}++;
3130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3131 } elsif ($methylation_calls[$index] eq 'x') {
3132 $counting{total_unmethylated_CHG_count}++;
3133 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3134 } elsif ($methylation_calls[$index] eq 'Z') {
3135 $counting{total_meCpG_count}++;
3136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3137 } elsif ($methylation_calls[$index] eq 'z') {
3138 $counting{total_unmethylated_CpG_count}++;
3139 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3140 } elsif ($methylation_calls[$index] eq 'H') {
3141 $counting{total_meCHH_count}++;
3142 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3143 } elsif ($methylation_calls[$index] eq 'h') {
3144 $counting{total_unmethylated_CHH_count}++;
3145 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n";
3146 }
3147 elsif ($methylation_calls[$index] eq '.') {}
3148 else{
3149 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3150 }
3151 }
3152 }
3153 elsif ($strand eq '-') {
3154
3155 for my $index (0..$#methylation_calls) {
3156 ### methylated Cs (any context) will receive a forward (+) orientation
3157 ### not methylated Cs (any context) will receive a reverse (-) orientation
3158
3159 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
3160 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3161 $cigar_offset += $cigar_mod;
3162 $pos_offset += $pos_mod;
3163 }
3164
3165 if ($methylation_calls[$index] eq 'X') {
3166 $counting{total_meCHG_count}++;
3167 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3168 } elsif ($methylation_calls[$index] eq 'x') {
3169 $counting{total_unmethylated_CHG_count}++;
3170 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3171 } elsif ($methylation_calls[$index] eq 'Z') {
3172 $counting{total_meCpG_count}++;
3173 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3174 } elsif ($methylation_calls[$index] eq 'z') {
3175 $counting{total_unmethylated_CpG_count}++;
3176 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3177 } elsif ($methylation_calls[$index] eq 'H') {
3178 $counting{total_meCHH_count}++;
3179 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3180 } elsif ($methylation_calls[$index] eq 'h') {
3181 $counting{total_unmethylated_CHH_count}++;
3182 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n";
3183 }
3184 elsif ($methylation_calls[$index] eq '.') {}
3185 else{
3186 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3187 }
3188 }
3189 }
3190 else {
3191 die "The strand information was neither + nor -: $strand\n";
3192 }
3193 }
3194 }
3195
3196
3197
3198 #######################################################################################################################################
3199 ### bismark2bedGaph section - START
3200 #######################################################################################################################################
3201
3202 ### has now been moved to the external script bismark2bedGraph
3203
3204 # sub process_bedGraph_output{
3205 # warn "="x64,"\n";
3206 # warn "Methylation information will now be written into a bedGraph file\n";
3207 # warn "="x64,"\n\n";
3208 # sleep (2);
3209
3210 # ### Closing all filehandles so that the Bismark methtylation extractor output doesn't get truncated due to buffering issues
3211 # foreach my $fh (keys %fhs) {
3212 # if ($fh =~ /^[1230]$/) {
3213 # foreach my $context (keys %{$fhs{$fh}}) {
3214 # close $fhs{$fh}->{$context} or die $!;
3215 # }
3216 # } else {
3217 # close $fhs{$fh} or die $!;
3218 # }
3219 # }
3220
3221 # ### deciding which files to use for bedGraph conversion
3222 # foreach my $filename (@sorting_files){
3223 # # warn "$filename\n";
3224 # if ($filename =~ /\//){ # if files are in a different output folder we extract the filename again
3225 # $filename =~ s/.*\///; # replacing everything up to the last slash in the filename
3226 # # warn "$filename\n";
3227 # }
3228
3229 # if ($CX_context){
3230 # push @bedfiles,$filename;
3231 # }
3232 # else{ ## CpG context only (default)
3233 # if ($filename =~ /^CpG_/){
3234 # push @bedfiles,$filename;
3235 # }
3236 # else{
3237 # # skipping CHH or CHG files
3238 # }
3239 # }
3240 # }
3241
3242 # warn "Using the following files as Input:\n";
3243 # print join ("\t",@bedfiles),"\n\n";
3244 # sleep (2);
3245
3246 # my %temp_fhs;
3247 # my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
3248
3249 # ### changing to the output directory
3250 # unless ($output_dir eq ''){ # default
3251 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
3252 # warn "Changed directory to $output_dir\n";
3253 # }
3254
3255 # foreach my $infile (@bedfiles) {
3256
3257 # if ($remove) {
3258 # warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
3259
3260 # if ($infile =~ /gz$/){
3261 # open (READ,"zcat $infile |") or die $!;
3262 # }
3263 # else{
3264 # open (READ,$infile) or die $!;
3265 # }
3266
3267 # my $removed_spaces_outfile = $infile;
3268 # $removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
3269
3270 # open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
3271
3272 # unless ($no_header){
3273 # $_ = <READ>; ### Bismark version header
3274 # print REM $_; ### Bismark version header
3275 # }
3276
3277 # while (<READ>) {
3278 # chomp;
3279 # my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
3280 # $id =~ s/\s+/_/g;
3281 # print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
3282 # }
3283
3284 # close READ or die $!;
3285 # close REM or die $!;
3286
3287 # ### changing the infile name to the new file without spaces
3288 # $infile = $removed_spaces_outfile;
3289 # }
3290
3291 # warn "Now writing methylation information for file $infile to individual files for each chromosome\n";
3292 # if ($infile =~ /gz$/){
3293 # open (IN,"zcat $infile |") or die $!;
3294 # }
3295 # else{
3296 # open (IN,$infile) or die $!;
3297 # }
3298
3299 # ## always ignoring the version header
3300 # unless ($no_header){
3301 # $_ = <IN>; ### Bismark version header
3302 # }
3303
3304 # while (<IN>) {
3305 # chomp;
3306 # my ($chr) = (split (/\t/))[2];
3307 # # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
3308 # $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names
3309 # # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
3310
3311 # unless (exists $temp_fhs{$chr}) {
3312 # open ($temp_fhs{$chr},'>','chr'.$chr.'.meth_extractor.temp') or die "Failed to open filehandle: $!";
3313 # }
3314 # print {$temp_fhs{$chr}} "$_\n";
3315 # }
3316
3317 # warn "Finished writing out individual chromosome files for $infile\n";
3318 # }
3319 # warn "\n";
3320
3321 # @temp_files = <*.meth_extractor.temp>;
3322
3323 # warn "Collecting temporary chromosome file information...\n";
3324 # sleep (1);
3325 # warn "processing the following input file(s):\n";
3326 # warn join ("\n",@temp_files),"\n\n";
3327 # sleep (1);
3328
3329 # foreach my $in (@temp_files) {
3330 # if ($sort_size){
3331 # warn "Sorting input file $in by positions (using -S of '$sort_size')\n";
3332 # }
3333 # else{
3334 # warn "Sorting input file $in by positions (using default memory settings)\n";
3335 # }
3336 # my $sort_dir = $output_dir;
3337 # if ($sort_dir eq ''){
3338 # $sort_dir = './';
3339 # }
3340 # open my $ifh, "sort -S $sort_size -T $sort_dir -k3,3 -k4,4n $in |" or die "Input file could not be sorted. $!";
3341 # # print "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n";
3342
3343 # ############################################# m.a.bentley - moved the variables out of the while loop to hold the current line data {
3344
3345 # my $name;
3346 # my $meth_state;
3347 # my $chr = "";
3348 # my $pos = 0;
3349 # my $meth_state2;
3350
3351 # my $last_pos;
3352 # my $last_chr;
3353
3354 # ############################################# }
3355
3356 # while (my $line = <$ifh>) {
3357 # next if $line =~ /^Bismark/;
3358 # chomp $line;
3359
3360 # ########################################### m.a.bentley - (1) set the last_chr and last_pos variables early in the while loop, before the line split (2) removed unnecessary setting of same variables in if statement {
3361
3362 # $last_chr = $chr;
3363 # $last_pos = $pos;
3364 # ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
3365
3366 # if (($last_pos ne $pos) || ($last_chr ne $chr)) {
3367 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
3368 # @methylcalls = qw (0 0 0);
3369 # }
3370
3371 # ############################################# }
3372
3373 # my $validated = validate_methylation_call($meth_state, $meth_state2);
3374 # unless($validated){
3375 # warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
3376 # next;
3377 # }
3378 # if ($meth_state eq "+") {
3379 # $methylcalls[0]++;
3380 # $methylcalls[2]++;
3381 # } else {
3382 # $methylcalls[1]++;
3383 # $methylcalls[2]++;
3384 # }
3385 # }
3386
3387 # ############################################# m.a.bentley - set the last_chr and last_pos variables for the last line in the file (outside the while loop's scope using the method i've implemented) {
3388
3389 # $last_chr = $chr;
3390 # $last_pos = $pos;
3391 # if ($methylcalls[2] > 0) {
3392 # generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
3393 # }
3394 # ############################################# }
3395
3396 # close $ifh or die $!;
3397
3398 # @methylcalls = qw (0 0 0); # resetting @methylcalls
3399
3400 # ### deleting temporary files
3401 # my $delete = unlink $in;
3402 # if ($delete) {
3403 # warn "Successfully deleted the temporary input file $in\n\n";
3404 # }
3405 # else {
3406 # warn "The temporary inputfile $in could not be deleted $!\n\n";
3407 # }
3408 # }
3409 # }
3410
3411 # sub generate_output{
3412 # my $methcount = $methylcalls[0];
3413 # my $nonmethcount = $methylcalls[1];
3414 # my $totalcount = $methylcalls[2];
3415 # my $last_chr = shift;
3416 # my $last_pos = shift;
3417 # croak "Should not be generating output if there's no reads to this region" unless $totalcount > 0;
3418 # croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if $totalcount != ($methcount + $nonmethcount);
3419
3420 # ############################################# m.a.bentley - declare a new variable 'bed_pos' to distinguish from bismark positions (-1) - previous scripts modified the last_pos variable earlier in the script leading to problems in meth % calculation {
3421
3422 # my $bed_pos = $last_pos -1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
3423 # my $meth_percentage;
3424 # ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
3425 # # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
3426 # if (defined $meth_percentage){
3427 # if ($counts){
3428 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
3429 # }
3430 # else{
3431 # print OUT "$last_chr\t$bed_pos\t$bed_pos\t$meth_percentage\n";
3432 # }
3433 # }
3434 # ############################################# }
3435 # }
3436
3437 # sub validate_methylation_call{
3438 # my $meth_state = shift;
3439 # croak "Missing (+/-) methylation call" unless defined $meth_state;
3440 # my $meth_state2 = shift;
3441 # croak "Missing alphabetical methylation call" unless defined $meth_state2;
3442 # my $is_consistent;
3443 # ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
3444 # : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
3445 # return 1 if $is_consistent;
3446 # return 0;
3447 # }
3448
3449 # sub check_CpG_methylation_call{
3450 # my $meth1 = shift;
3451 # my $meth2 = shift;
3452 # return 1 if($meth1 eq "+" && $meth2 eq "Z");
3453 # return 1 if($meth1 eq "-" && $meth2 eq "z");
3454 # return 0;
3455 # }
3456
3457 # sub check_nonCpG_methylation_call{
3458 # my $meth1 = shift;
3459 # my $meth2 = shift;
3460 # return 1 if($meth1 eq "+" && $meth2 eq "C");
3461 # return 1 if($meth1 eq "+" && $meth2 eq "X");
3462 # return 1 if($meth1 eq "+" && $meth2 eq "H");
3463 # return 1 if($meth1 eq "-" && $meth2 eq "c");
3464 # return 1 if($meth1 eq "-" && $meth2 eq "x");
3465 # return 1 if($meth1 eq "-" && $meth2 eq "h");
3466 # return 0;
3467 # }
3468
3469 #######################################################################################################################################
3470 ### bismark2bedGaph section - END
3471 #######################################################################################################################################
3472
3473
3474
3475
3476
3477
3478 # #######################################################################################################################################
3479 # ### genome-wide cytosine methylation report - START
3480 # #######################################################################################################################################
3481
3482 ### has now been moved to the external script bedGraph2cytosine
3483
3484 # sub generate_genome_wide_cytosine_report {
3485
3486 # warn "="x78,"\n";
3487 # warn "Methylation information will now be written into a genome-wide cytosine report\n";
3488 # warn "="x78,"\n\n";
3489 # sleep (2);
3490
3491 # ### changing to the output directory again
3492 # unless ($output_dir eq ''){ # default
3493 # chdir $output_dir or die "Failed to change directory to $output_dir\n";
3494 # # warn "Changed directory to $output_dir\n";
3495 # }
3496
3497 # my $in = shift;
3498 # open (IN,$in) or die $!;
3499
3500 # my $cytosine_out = shift;
3501
3502 # if ($CX_context){
3503 # $cytosine_out =~ s/$/genome-wide_CX_report.txt/;
3504 # }
3505 # else{
3506 # $cytosine_out =~ s/$/genome_wide_CpG_report.txt/;
3507 # }
3508
3509 # ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
3510 # unless ($split_by_chromosome){ ### writing all output to a single file (default)
3511 # open (CYT,'>',$cytosine_out) or die $!;
3512 # warn "Writing genome-wide cytosine report to: $cytosine_out\n";
3513 # sleep (3);
3514 # }
3515
3516 # my $last_chr;
3517 # my %chr; # storing reads for one chromosome at a time
3518
3519 # my $count = 0;
3520 # while (<IN>){
3521 # chomp;
3522 # ++$count;
3523 # my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
3524
3525 # # defining the first chromosome
3526 # unless (defined $last_chr){
3527 # $last_chr = $chr;
3528 # # warn "Storing all covered cytosine positions for chromosome: $chr\n";
3529 # }
3530
3531 # if ($chr eq $last_chr){
3532 # $chr{$chr}->{$start}->{meth} = $meth;
3533 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3534 # }
3535 # else{
3536 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3537
3538 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3539 # my $chromosome_out = $cytosine_out;
3540 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3541 # open (CYT,'>',$chromosome_out) or die $!;
3542 # }
3543
3544 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
3545
3546 # my $tri_nt = '';
3547 # my $context = '';
3548 # my $pos = pos$chromosomes{$last_chr};
3549
3550 # my $strand;
3551 # my $meth = 0;
3552 # my $nonmeth = 0;
3553
3554 # if ($1 eq 'C'){ # C on forward strand
3555 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3556 # $strand = '+';
3557 # }
3558 # elsif ($1 eq 'G'){ # C on reverse strand
3559 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3560 # $tri_nt = reverse $tri_nt;
3561 # $tri_nt =~ tr/ACTG/TGAC/;
3562 # $strand = '-';
3563 # }
3564 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3565
3566 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3567 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
3568 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3569 # }
3570
3571 # ### determining cytosine context
3572 # if ($tri_nt =~ /^CG/){
3573 # $context = 'CG';
3574 # }
3575 # elsif ($tri_nt =~ /^C.{1}G$/){
3576 # $context = 'CHG';
3577 # }
3578 # elsif ($tri_nt =~ /^C.{2}$/){
3579 # $context = 'CHH';
3580 # }
3581 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3582 # warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
3583 # next;
3584 # }
3585
3586 # if ($CpG_only){
3587 # if ($tri_nt =~ /^CG/){ # CpG context is the default
3588 # if ($zero){ # zero based coordinates
3589 # $pos -= 1;
3590 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3591 # }
3592 # else{ # default
3593 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3594 # }
3595 # }
3596 # }
3597 # else{ ## all cytosines, specified with --CX
3598 # if ($zero){ # zero based coordinates
3599 # $pos -= 1;
3600 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3601 # }
3602 # else{ # default
3603 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3604 # }
3605 # }
3606 # }
3607
3608 # %chr = (); # resetting the hash
3609
3610 # # new first entry
3611 # $last_chr = $chr;
3612 # $chr{$chr}->{$start}->{meth} = $meth;
3613 # $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
3614 # }
3615 # }
3616
3617 # # Last found chromosome
3618 # warn "Writing cytosine reports for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
3619
3620 # if ($split_by_chromosome){ ## writing output to 1 file per chromosome
3621 # my $chromosome_out = $cytosine_out;
3622 # $chromosome_out =~ s/txt$/chr${last_chr}.txt/;
3623 # open (CYT,'>',$chromosome_out) or die $!;
3624 # }
3625
3626 # while ( $chromosomes{$last_chr} =~ /([CG])/g){
3627
3628 # my $tri_nt;
3629 # my $context;
3630 # my $pos = pos$chromosomes{$last_chr};
3631
3632 # my $strand;
3633 # my $meth = 0;
3634 # my $nonmeth = 0;
3635
3636 # if ($1 eq 'C'){ # C on forward strand
3637 # $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
3638 # $strand = '+';
3639 # }
3640 # elsif ($1 eq 'G'){ # C on reverse strand
3641 # $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
3642 # $tri_nt = reverse $tri_nt;
3643 # $tri_nt =~ tr/ACTG/TGAC/;
3644 # $strand = '-';
3645 # }
3646
3647 # if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 0-based!
3648 # $meth = $chr{$last_chr}->{$pos-1}->{meth};
3649 # $nonmeth = $chr{$last_chr}->{$pos-1}->{nonmeth};
3650 # }
3651
3652 # next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
3653
3654 # ### determining cytosine context
3655 # if ($tri_nt =~ /^CG/){
3656 # $context = 'CG';
3657 # }
3658 # elsif ($tri_nt =~ /^C.{1}G$/){
3659 # $context = 'CHG';
3660 # }
3661 # elsif ($tri_nt =~ /^C.{2}$/){
3662 # $context = 'CHH';
3663 # }
3664 # else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
3665 # warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
3666 # next;
3667 # }
3668
3669 # if ($CpG_only){
3670 # if ($tri_nt =~ /^CG/){ # CpG context is the default
3671 # if ($zero){ # zero-based coordinates
3672 # $pos -= 1;
3673 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3674 # }
3675 # else{ # default
3676 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3677 # }
3678 # }
3679 # }
3680 # else{ ## all cytosines, specified with --CX
3681 # if ($zero){ # zero based coordinates
3682 # $pos -= 1;
3683 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3684 # }
3685 # else{ # default
3686 # print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
3687 # }
3688 # }
3689 # }
3690 # close CYT or die $!;
3691 # }
3692
3693
3694 # sub read_genome_into_memory{
3695
3696 # ## reading in and storing the specified genome in the %chromosomes hash
3697 # chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
3698 # warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
3699
3700 # my @chromosome_filenames = <*.fa>;
3701
3702 # ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
3703 # unless (@chromosome_filenames){
3704 # @chromosome_filenames = <*.fasta>;
3705 # }
3706 # unless (@chromosome_filenames){
3707 # die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
3708 # }
3709
3710 # foreach my $chromosome_filename (@chromosome_filenames){
3711
3712 # # skipping the tophat entire mouse genome fasta file
3713 # next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
3714
3715 # open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
3716 # ### first line needs to be a fastA header
3717 # my $first_line = <CHR_IN>;
3718 # chomp $first_line;
3719 # $first_line =~ s/\r//; # removing /r carriage returns
3720
3721 # ### Extracting chromosome name from the FastA header
3722 # my $chromosome_name = extract_chromosome_name($first_line);
3723
3724 # my $sequence;
3725 # while (<CHR_IN>){
3726 # chomp;
3727 # $_ =~ s/\r//; # removing /r carriage returns
3728
3729 # if ($_ =~ /^>/){
3730 # ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
3731 # if (exists $chromosomes{$chromosome_name}){
3732 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3733 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
3734 # }
3735 # else {
3736 # if (length($sequence) == 0){
3737 # warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
3738 # }
3739 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3740 # $chromosomes{$chromosome_name} = $sequence;
3741 # }
3742 # ### resetting the sequence variable
3743 # $sequence = '';
3744 # ### setting new chromosome name
3745 # $chromosome_name = extract_chromosome_name($_);
3746 # }
3747 # else{
3748 # $sequence .= uc$_;
3749 # }
3750 # }
3751
3752 # if (exists $chromosomes{$chromosome_name}){
3753 # warn "chr $chromosome_name (",length $sequence ," bp)\t";
3754 # die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
3755 # }
3756 # else{
3757 # if (length($sequence) == 0){
3758 # warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
3759 # }
3760 # warn "chr $chromosome_name (",length $sequence ," bp)\n";
3761 # $chromosomes{$chromosome_name} = $sequence;
3762 # }
3763 # }
3764 # warn "\n";
3765 # chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
3766 # }
3767
3768 # sub extract_chromosome_name {
3769 # ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
3770 # my $fasta_header = shift;
3771 # if ($fasta_header =~ s/^>//){
3772 # my ($chromosome_name) = split (/\s+/,$fasta_header);
3773 # return $chromosome_name;
3774 # }
3775 # else{
3776 # die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
3777 # }
3778 # }
3779
3780 # #######################################################################################################################################
3781 # ### genome-wide cytosine methylation report - END
3782 # #######################################################################################################################################
3783
3784
3785
3786
3787 sub print_helpfile{
3788
3789 print << 'HOW_TO';
3790
3791
3792 DESCRIPTION
3793
3794 The following is a brief description of all options to control the Bismark
3795 methylation extractor. The script reads in a bisulfite read alignment results file
3796 produced by the Bismark bisulfite mapper and extracts the methylation information
3797 for individual cytosines. This information is found in the methylation call field
3798 which can contain the following characters:
3799
3800 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3801 ~~~ X for methylated C in CHG context (was protected) ~~~
3802 ~~~ x for not methylated C CHG (was converted) ~~~
3803 ~~~ H for methylated C in CHH context (was protected) ~~~
3804 ~~~ h for not methylated C in CHH context (was converted) ~~~
3805 ~~~ Z for methylated C in CpG context (was protected) ~~~
3806 ~~~ z for not methylated C in CpG context (was converted) ~~~
3807 ~~~ . for any bases not involving cytosines ~~~
3808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3809
3810 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
3811 context (this distinction is actually already made in Bismark itself). As the methylation
3812 information for every C analysed can produce files which easily have tens or even hundreds of
3813 millions of lines, file sizes can become very large and more difficult to handle. The C
3814 methylation info additionally splits cytosine methylation calls up into one of the four possible
3815 strands a given bisulfite read aligned against:
3816
3817 OT original top strand
3818 CTOT complementary to original top strand
3819
3820 OB original bottom strand
3821 CTOB complementary to original bottom strand
3822
3823 Thus, by default twelve individual output files are being generated per input file (unless
3824 --comprehensive is specified, see below). The output files can be imported into a genome
3825 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
3826 unless the bisulfite reads were generated preserving directionality it doesn't make any
3827 sense to look at the data in a strand-specific manner). Strand-specific output files can
3828 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
3829 will be generated. For both the strand-specific and comprehensive outputs there is also
3830 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
3831
3832
3833 The output files are in the following format (tab delimited):
3834
3835 <sequence_id> <strand> <chromosome> <position> <methylation call>
3836
3837
3838 USAGE: methylation_extractor [options] <filenames>
3839
3840
3841 ARGUMENTS:
3842
3843 <filenames> A space-separated list of Bismark result files in SAM format from
3844 which methylation information is extracted for every cytosine in
3845 the reads. For alignment files in the older custom Bismark output
3846 see option '--vanilla'.
3847
3848 OPTIONS:
3849
3850 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
3851 read data. Specifying either --single-end or --paired-end is
3852 mandatory.
3853
3854 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
3855 read data. Specifying either --paired-end or --single-end is
3856 mandatory.
3857
3858 --vanilla The Bismark result input file(s) are in the old custom Bismark format
3859 (up to version 0.5.x) and not in SAM format which is the default as
3860 of Bismark version 0.6.x or higher. Default: OFF.
3861
3862 --no_overlap For paired-end reads it is theoretically possible that read_1 and
3863 read_2 overlap. This option avoids scoring overlapping methylation
3864 calls twice (only methylation calls of read 1 are used for in the process
3865 since read 1 has historically higher quality basecalls than read 2).
3866 Whilst this option removes a bias towards more methylation calls
3867 in the center of sequenced fragments it may de facto remove a sizable
3868 proportion of the data. This option is highly recommended for paired-end
3869 data.
3870
3871 --ignore <int> Ignore the first <int> bp at the 5' end of each read when processing the
3872 methylation call string. This can remove e.g. a restriction enzyme site
3873 at the start of each read.
3874
3875 --comprehensive Specifying this option will merge all four possible strand-specific
3876 methylation info into context-dependent output files. The default
3877 contexts are:
3878 - CpG context
3879 - CHG context
3880 - CHH context
3881
3882 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
3883 strand-specific output files (default) for Cs in
3884 - CpG context
3885 - non-CpG context
3886
3887 --report Prints out a short methylation summary as well as the paramaters used to run
3888 this script.
3889
3890 --no_header Suppresses the Bismark version header line in all output files for more convenient
3891 batch processing.
3892
3893 -o/--output DIR Allows specification of a different output directory (absolute or relative
3894 path). If not specified explicitely, the output will be written to the current directory.
3895
3896 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
3897 explicitly if Samtools is in the PATH already.
3898
3899 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
3900 a GZIP compressed form to save disk space. This option does not work on bedGraph and
3901 genome-wide cytosine reports as they are 'tiny' anyway.
3902
3903 --version Displays version information.
3904
3905 -h/--help Displays this help file and exits.
3906
3907
3908
3909 bedGraph specific options:
3910
3911 --bedGraph After finishing the methylation extraction, the methylation output is written into a
3912 sorted bedGraph file that reports the position of a given cytosine and its methylation
3913 state (in %, see details below). The methylation extractor output is temporarily split up into
3914 temporary files, one per chromosome (written into the current directory or folder
3915 specified with -o/--output); these temp files are then used for sorting and deleted
3916 afterwards. By default, only cytosines in CpG context will be sorted. The option
3917 '--CX_context' may be used to report all cytosines irrespective of sequence context
3918 (this will take MUCH longer!). The default folder for temporary files during the sorting
3919 process is the output directory. The bedGraph conversion step is performed by the external
3920 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
3921 bismark_methylation_extractor itself.
3922
3923
3924 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
3925 before its methylation percentage is reported. Default: 1.
3926
3927 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
3928
3929
3930 --counts Adds two additional columns to the output file to enable further calculations:
3931 col 5: number of methylated calls
3932 col 6: number of unmethylated calls
3933 This option is required if '--cytosine_report' is specified (and will be set automatically if
3934 necessary).
3935
3936 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
3937 in the experiment irrespective of its sequence context. This applies to both forward and
3938 reverse strands. Please be aware that this option may generate large temporary and output files
3939 and may take a long time to sort (up to many hours). Default: OFF.
3940 (i.e. Default = CpG context only).
3941
3942 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
3943 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
3944 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
3945 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
3946 Defaults to 2G.
3947
3948
3949 Genome-wide cytosine methylation report specific options:
3950
3951 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
3952 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
3953 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
3954 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
3955 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
3956 covered). The cytsoine report conversion step is performed by the external module
3957 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
3958 itself.
3959
3960 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
3961 its context. This applies to both forward and reverse strands. Please be aware that this will
3962 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
3963 Default: OFF (i.e. Default = CpG context only).
3964
3965 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
3966
3967 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
3968 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
3969
3970 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
3971 will be named to include the input filename and the chromosome number.
3972
3973
3974
3975 OUTPUT:
3976
3977 The bismark_methylation_extractor output is in the form:
3978 ========================================================
3979 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
3980
3981 * Methylated cytosines receive a '+' orientation,
3982 * Unmethylated cytosines receive a '-' orientation.
3983
3984
3985
3986 The bedGraph output (optional) looks like this (tab-delimited):
3987 ===============================================================
3988 <chromosome> <start position> <end position> <methylation percentage>
3989
3990 The bedGraph output with '--counts' specified looks like this (tab-delimited):
3991
3992 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
3993
3994
3995
3996 The genome-wide cytosine methylation output file is tab-delimited in the following format:
3997 ==========================================================================================
3998 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
3999
4000
4001
4002 This script was last modified on 21 April 2013.
4003
4004 HOW_TO
4005 }