comparison bismark_methyl_extractor/bismark_methylation_extractor @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8 use FindBin qw($Bin);
9 use lib "$Bin/../lib";
10
11 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
12
13 ## This program is free software: you can redistribute it and/or modify
14 ## it under the terms of the GNU General Public License as published by
15 ## the Free Software Foundation, either version 3 of the License, or
16 ## (at your option) any later version.
17
18 ## This program is distributed in the hope that it will be useful,
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ## GNU General Public License for more details.
22
23 ## You should have received a copy of the GNU General Public License
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
25
26 my @filenames; # input files
27 my %counting;
28 my $parent_dir = getcwd();
29
30 my %fhs;
31
32 my $version = 'v0.16.3';
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline();
34
35
36 ### only needed for bedGraph output
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
39 my @bedfiles;
40
41 ### only needed for genome-wide cytosine methylation report
42 my %chromosomes;
43
44 my %mbias_1;
45 my %mbias_2;
46
47
48 ##############################################################################################
49 ### Summarising Run Parameters
50 ##############################################################################################
51
52 ### METHYLATION EXTRACTOR
53
54 warn "Summarising Bismark methylation extractor parameters:\n";
55 warn '='x63,"\n";
56
57 if ($single){
58 if ($vanilla){
59 warn "Bismark single-end vanilla format specified\n";
60 }
61 else{
62 warn "Bismark single-end SAM format specified (default)\n"; # default
63 }
64 }
65 elsif ($paired){
66 if ($vanilla){
67 warn "Bismark paired-end vanilla format specified\n";
68 }
69 else{
70 warn "Bismark paired-end SAM format specified (default)\n"; # default
71 }
72 }
73
74 warn "Number of cores to be used: $multicore\n";
75
76 if ($single){
77 if ($ignore){
78 warn "First $ignore bp will be disregarded when processing the methylation call string\n";
79 }
80 if ($ignore_3prime){
81 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n";
82 }
83
84 }
85 else{ ## paired-end
86 if ($ignore){
87 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
88 }
89 if ($ignore_r2){
90 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
91 }
92
93 if ($ignore_3prime){
94 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n";
95 }
96 if ($ignore_3prime_r2){
97 warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
98 }
99
100
101 }
102
103
104 if ($full){
105 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
106 }
107 if ($merge_non_CpG){
108 warn "Merge CHG and CHH context to non-CpG context specified\n";
109 }
110 ### output directory
111 if ($output_dir eq ''){
112 warn "Output will be written to the current directory ('$parent_dir')\n";
113 }
114 else{
115 warn "Output path specified as: $output_dir\n";
116 }
117
118
119 sleep (1);
120
121 ### BEDGRAPH
122
123 if ($bedGraph){
124 warn "\n\nSummarising bedGraph parameters:\n";
125 warn '='x63,"\n";
126
127 if ($counts){
128 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
129 }
130 else{
131 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
132 }
133
134 ### Zero-based coordinates
135 if ($zero){
136 warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
137 }
138
139 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
140
141 if ($CX_context){
142 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
143 }
144 else{ # default
145 $CpG_only = 1;
146 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
147 }
148
149 if ($remove){
150 warn "White spaces in read ID names will be removed prior to sorting\n";
151 }
152
153 if ($ample_mem){
154 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
155 }
156 elsif (defined $sort_size){
157 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
158 }
159 else{
160 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
161 }
162
163
164
165 sleep (1);
166
167 if ($cytosine_report){
168 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
169 warn '='x63,"\n";
170 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
171
172
173 if ($CX_context){
174 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
175 }
176 else{ # default
177 $CpG_only = 1;
178 warn "Reporting cytosine methylation in CpG context only (default)\n";
179 }
180
181 if ($split_by_chromosome){
182 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
183 }
184
185 ### Zero-based coordinates
186 if ($zero){
187 warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
188 }
189 else{ # default, 1-based coords
190 warn "Using 1-based genomic coordinates (default)\n";
191 }
192
193 ### GENOME folder
194 if ($genome_folder){
195 unless ($genome_folder =~/\/$/){
196 $genome_folder =~ s/$/\//;
197 }
198 warn "Genome folder was specified as $genome_folder\n";
199 }
200 else{
201 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
202 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
203 }
204 sleep (1);
205 }
206 }
207
208 warn "\n";
209 sleep (1);
210
211 ######################################################
212 ### PROCESSING FILES
213 ######################################################
214
215 foreach my $filename (@filenames){
216 # resetting counters and filehandles
217 %fhs = ();
218 %counting =(
219 total_meCHG_count => 0,
220 total_meCHH_count => 0,
221 total_meCpG_count => 0,
222 total_unmethylated_CHG_count => 0,
223 total_unmethylated_CHH_count => 0,
224 total_unmethylated_CpG_count => 0,
225 sequences_count => 0,
226 methylation_call_strings => 0,
227 );
228
229 @sorting_files = ();
230 @bedfiles = ();
231
232 %mbias_1 = ();
233 %mbias_2 = ();
234
235
236 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
237 unless ($vanilla){
238 if ($paired){
239 test_positional_sorting($filename);
240 }
241 }
242
243 my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename);
244
245 if ($pid == 0){
246 warn "Finished processing child process. Exiting..\n";
247
248 # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
249 # foreach my $fh (keys %fhs) {
250 # if ($fh =~ /^[1230]$/) {
251 # foreach my $context (keys %{$fhs{$fh}}) {
252 # $fhs{$fh}->{$context}->flush;
253 # }
254 # }
255 # else{
256 # $fhs{$fh}->flush;
257 # }
258 # }
259 exit 0;
260 }
261
262 ###
263 if ($pid and $multicore > 1){
264 warn "Now waiting for all child processes to complete\n";
265 sleep(1);
266
267 ### we need to ensure that we wait for all child processes to be finished before continuing
268 # warn "here are the child IDs: @$pids\n";
269 # warn "Looping through the child process IDs:\n";
270
271 foreach my $id (@$pids){
272 # print "$id\t";
273 my $kid = waitpid ($id,0);
274 # print "Returned: $kid\nExit status: $?\n";
275 unless ($? == 0){
276 warn "\nChild process terminated with exit signal: '$?'\n\n";
277 }
278 }
279 }
280
281 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
282 foreach my $fh (keys %fhs) {
283 if ($fh =~ /^[1230]$/) {
284 foreach my $context (keys %{$fhs{$fh}}) {
285 close $fhs{$fh}->{$context} or die $!;
286 }
287 }
288 else{
289 close $fhs{$fh} or die $!;
290 }
291 }
292
293 ### We need to stitch together a main splitting report from all individual parent/child processes
294 if ($multicore > 1){
295 merge_individual_splitting_reports($report_basename);
296 print_splitting_report();
297
298 merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally
299
300 }
301
302 unless ($mbias_off){
303 ### printing out all M-Bias data
304 produce_mbias_plots ($filename); produce_mbias_plots ($filename);
305
306 }
307
308 unless ($mbias_only){
309 delete_unused_files();
310 }
311
312 if ($bedGraph){
313
314 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
315 $out =~ s/gz$//;
316 $out =~ s/sam$//;
317 $out =~ s/bam$//;
318 $out =~ s/txt$//;
319 $out =~ s/$/bedGraph/;
320
321 my $bedGraph_output = $out;
322 my @args;
323
324 if ($remove){
325 push @args, '--remove';
326 }
327 if ($CX_context){
328 push @args, '--CX_context';
329 }
330 if ($no_header){
331 push @args, '--no_header';
332 }
333 if ($gazillion){
334 push @args, '--gazillion';
335 }
336 if ($ample_mem){
337 push @args, '--ample_memory';
338 }
339 if ($zero){
340 push @args, "--zero";
341 }
342
343 # if ($counts){
344 # push @args, "--counts";
345 # }
346
347 push @args, "--buffer_size $sort_size";
348 push @args, "--cutoff $coverage_threshold";
349 push @args, "--output $bedGraph_output";
350 push @args, "--dir '$output_dir'";
351
352 ### adding all files to be sorted to @args
353 foreach my $f (@sorting_files){
354 push @args, $f;
355 }
356
357 # print join "\t",@args,"\n";
358
359 system ("$Bin/bismark2bedGraph @args");
360
361 warn "Finished BedGraph conversion ...\n\n";
362 sleep(1);
363
364 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
365 # warn "Writing bedGraph to file: $bedGraph_output\n";
366 # process_bedGraph_output();
367 # close OUT or die $!;
368
369 ### genome-wide cytosine methylation report requires bedGraph processing anyway
370 if ($cytosine_report){
371
372 @args = (); # resetting @args
373 my $cytosine_out = $out;
374 $cytosine_out =~ s/bedGraph$//;
375
376 if ($CX_context){
377 $cytosine_out =~ s/$/CX_report.txt/;
378 }
379 else{
380 $cytosine_out =~ s/$/CpG_report.txt/;
381 }
382
383 push @args, "--output $cytosine_out";
384 push @args, "--dir '$output_dir'";
385 push @args, "--genome '$genome_folder'";
386 push @args, "--parent_dir '$parent_dir'";
387
388 if ($zero){
389 push @args, "--zero";
390 }
391 if ($CX_context){
392 push @args, '--CX_context';
393 }
394 if ($split_by_chromosome){
395 push @args, '--split_by_chromosome';
396 }
397 if ($gzip){
398 push @args, '--gzip';
399 }
400
401 my $coverage_output = $bedGraph_output;
402 $coverage_output =~ s/bedGraph$/bismark.cov.gz/;
403
404 push @args, $coverage_output; # this will be the infile
405 # warn "Handing over the following infile: $coverage_output\n"; sleep (5);
406 system ("$Bin/coverage2cytosine @args");
407
408 warn "\n\nFinished generating genome-wide cytosine report\n\n";
409 }
410 }
411 }
412
413 sub merge_individual_splitting_reports{
414
415 my $report_basename = shift;
416
417 my @splitting_reports; # only needed in multi-core mode to generate an overall report
418 foreach my $ext (1..$multicore){
419 push @splitting_reports, "$report_basename.$ext";
420 }
421 warn "\nMerging individual splitting reports into overall report: '$report_basename'\n";
422 warn "Merging from these individual files:\n";
423 print join ("\n",@splitting_reports),"\n\n";
424 sleep(1);
425
426 ##########
427 # resetting the counter first
428 %counting =(
429 total_meCHG_count => 0,
430 total_meCHH_count => 0,
431 total_meCpG_count => 0,
432 total_unmethylated_CHG_count => 0,
433 total_unmethylated_CHH_count => 0,
434 total_unmethylated_CpG_count => 0,
435 sequences_count => 0,
436 methylation_call_strings => 0,
437 );
438
439 # repopulating the merged counter
440 foreach my $file (@splitting_reports){
441 open (IR,$file) or die $!;
442 while (<IR>){
443 chomp;
444 my ($context,$count) = (split /\t/);
445 if ($context){
446 if ($context =~ /^Total C to T conversions in CpG context/){
447 # warn "context: $context\ncount: $count\n";
448 $counting{total_unmethylated_CpG_count} += $count;
449 }
450 elsif ($context =~ /^Total C to T conversions in CHG context/){
451 # warn "context: $context\ncount: $count\n";
452 $counting{total_unmethylated_CHG_count} += $count;
453 }
454 elsif ($context =~ /^Total C to T conversions in CHH context/){
455 # warn "context: $context\ncount: $count\n";
456 $counting{total_unmethylated_CHH_count} += $count;
457 }
458 elsif ($context =~ /^Total methylated C\'s in CpG context/){
459 # warn "context: $context\ncount: $count\n";
460 $counting{total_meCpG_count} += $count;
461 }
462 elsif ($context =~ /^Total methylated C\'s in CHG context/){
463 # warn "context: $context\ncount: $count\n";
464 $counting{total_meCHG_count} += $count;
465 }
466 elsif ($context =~ /^Total methylated C\'s in CHH context/){
467 # warn "context: $context\ncount: $count\n";
468 $counting{total_meCHH_count} += $count;
469 }
470 elsif ($context =~ /^line count/){
471 # warn "Line count\ncount: $count\n";
472 $counting{sequences_count} = $count; # always the same
473 }
474 elsif ($context =~ /^meth call strings/){
475 # warn "Meth call strings\ncount: $count\n";
476 $counting{methylation_call_strings} += $count;
477 }
478 }
479 }
480 }
481
482 # deleting the individual reports afterwards
483 foreach my $file (@splitting_reports){
484 unlink $file;
485 }
486 }
487
488
489 sub merge_individual_mbias_reports{
490
491 my $report_basename = shift;
492
493 my @mbias_reports; # only needed in multi-core mode to generate an overall report
494 foreach my $ext (1..$multicore){
495 push @mbias_reports, "$report_basename.${ext}.mbias";
496 }
497 warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n";
498 print join ("\n",@mbias_reports),"\n\n";
499
500
501 ##########
502 # resetting the counters first, then repopulating them
503 %mbias_1 = ();
504 %mbias_2 = ();
505
506 # repopulating the merged counter
507 foreach my $file (@mbias_reports){
508 open (IR,$file) or die $!;
509
510 my $context;
511 my $read;
512
513 while (<IR>){
514 chomp;
515 # warn "$_\n"; sleep(1);
516 if ($_ =~ /context/){
517 $context = $1 if ($_ =~ /(\D{3}) context/);
518 # warn "Context set as $context\n";
519
520 if ($_ =~ /R2/){
521 $read = 'R2';
522 }
523 else{
524 $read = 'R1';
525 }
526 # warn "Setting read identity to '$read'\n";
527
528 # reading in 2 additional lines (===========, and header line)
529 $_ = <IR>;
530 #warn "discarding line $_\n";
531 $_ = <IR>;
532 #warn "discarding line $_\n";
533 next;
534 }
535 else{
536
537 if ($_ eq ''){
538 # empty line, only occurs after a context has finished and before a new context starts
539 next;
540 }
541
542 my ($pos,$meth,$unmeth) = (split /\t/);
543 # warn "$pos\t$meth\t$unmeth\n"; sleep(1);
544 if ($read eq 'R1'){
545 $mbias_1{$context}->{$pos}->{meth} += $meth;
546 $mbias_1{$context}->{$pos}->{un} += $unmeth;
547 }
548 elsif ($read eq 'R2'){
549 $mbias_2{$context}->{$pos}->{meth} += $meth;
550 $mbias_2{$context}->{$pos}->{un} += $unmeth;
551 }
552 }
553 }
554 close IR or warn "Had trouble closing filehandle for $file: $!\n";
555 }
556
557 # deleting the individual reports afterwards
558 foreach my $file (@mbias_reports){
559 unlink $file;
560 }
561 }
562
563
564 sub delete_unused_files{
565
566 warn "Deleting unused files ...\n\n"; sleep(1);
567
568 my $index = 0;
569
570 while ($index <= $#sorting_files){
571 if ($sorting_files[$index] =~ /gz$/){
572 open (USED,"gunzip -c $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
573 }
574 else{
575 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
576 }
577
578 my $used = 0;
579
580 while (<USED>){
581 next if (/^Bismark/);
582 if ($_){
583 $used = 1;
584 last;
585 }
586 }
587
588 if ($used){
589 warn "$sorting_files[$index] contains data ->\tkept\n";
590 ++$index;
591 }
592 else{
593
594 my $delete = unlink $sorting_files[$index];
595
596 if ($delete){
597 warn "$sorting_files[$index] was empty ->\tdeleted\n";
598 }
599 else{
600 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n"
601 }
602
603 ### we also need to remove the element from @sorting_files
604 splice @sorting_files, $index, 1;
605 }
606 }
607 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately)
608 }
609
610 sub produce_mbias_plots{
611
612 my $filename = shift;
613
614 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
615 $mbias =~ s/gz$//;
616 $mbias =~ s/sam$//;
617 $mbias =~ s/bam$//;
618 $mbias =~ s/cram$//;
619 $mbias =~ s/txt$//;
620 my $mbias_graph_1 = my $mbias_graph_2 = $mbias;
621 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png';
622 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png';
623
624 $mbias =~ s/$/M-bias.txt/;
625
626 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n";
627
628 # determining maximum read length
629 my $max_length_1 = 0;
630 my $max_length_2 = 0;
631
632 foreach my $context (keys %mbias_1){
633 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
634 $max_length_1 = $pos unless ($max_length_1 >= $pos);
635 }
636 }
637 if ($paired){
638 foreach my $context (keys %mbias_2){
639 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
640 $max_length_2 = $pos unless ($max_length_2 >= $pos);
641 }
642 }
643 }
644
645 if ($single){
646 warn "Determining maximum read length for M-Bias plot\n";
647 warn "Maximum read length of Read 1: $max_length_1\n\n";
648 }
649 else{
650 warn "Determining maximum read lengths for M-Bias plots\n";
651 warn "Maximum read length of Read 1: $max_length_1\n";
652 warn "Maximum read length of Read 2: $max_length_2\n\n";
653 }
654 # sleep(3);
655
656 my @mbias_read1;
657 my @mbias_read2;
658
659 #Check whether the module GD::Graph:lines is installed
660 my $gd_graph_installed = 0;
661 eval{
662 require GD::Graph::lines;
663 GD::Graph::lines->import();
664 };
665
666 unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...}
667 $gd_graph_installed = 1;
668
669 #Check whether the module GD::Graph::colour is installed
670 eval{
671 require GD::Graph::colour;
672 GD::Graph::colour->import(qw(:colours :lists :files :convert));
673 };
674
675 if ($@) {
676 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
677 sleep(2);
678 $gd_graph_installed = 0;
679 }
680
681
682 }
683 else{
684 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
685 sleep(2);
686 }
687
688
689 my $graph_title;
690 my $graph1;
691 my $graph2;
692
693 if ( $gd_graph_installed){
694 $graph1 = GD::Graph::lines->new(800,600);
695 if ($paired){
696 $graph2 = GD::Graph::lines->new(800,600);
697 }
698 }
699
700 foreach my $context (qw(CpG CHG CHH)){
701 @{$mbias_read1[0]} = ();
702
703 if ($paired){
704 print MBIAS "$context context (R1)\n================\n";
705 $graph_title = 'M-bias (Read 1)';
706 }
707 else{
708 print MBIAS "$context context\n===========\n";
709 $graph_title = 'M-bias';
710 }
711 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
712
713 foreach my $pos (1..$max_length_1){
714
715 unless (defined $mbias_1{$context}->{$pos}->{meth}){
716 $mbias_1{$context}->{$pos}->{meth} = 0;
717 }
718 unless (defined $mbias_1{$context}->{$pos}->{un}){
719 $mbias_1{$context}->{$pos}->{un} = 0;
720 }
721
722 my $percent = '';
723 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
724 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
725 }
726 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
727
728 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
729 push @{$mbias_read1[0]},$pos;
730
731 if ($context eq 'CpG'){
732 push @{$mbias_read1[1]},$percent;
733 push @{$mbias_read1[4]},$coverage;
734 }
735 elsif ($context eq 'CHG'){
736 push @{$mbias_read1[2]},$percent;
737 push @{$mbias_read1[5]},$coverage;
738 }
739 elsif ($context eq 'CHH'){
740 push @{$mbias_read1[3]},$percent;
741 push @{$mbias_read1[6]},$coverage;
742 }
743 }
744 print MBIAS "\n";
745 }
746
747 if ( $gd_graph_installed){
748
749 add_colour(nice_blue => [31,120,180]);
750 add_colour(nice_orange => [255,127,0]);
751 add_colour(nice_green => [51,160,44]);
752 add_colour(pale_blue => [153,206,227]);
753 add_colour(pale_orange => [253,204,138]);
754 add_colour(pale_green => [191,230,207]);
755
756 $graph1->set(
757 x_label => 'position (bp)',
758 y1_label => '% methylation',
759 y2_label => '# methylation calls',
760 title => $graph_title,
761 line_width => 2,
762 x_max_value => $max_length_1,
763 x_min_value => 0,
764 y_tick_number => 10,
765 y_label_skip => 2,
766 y1_max_value => 100,
767 y1_min_value => 0,
768 y_label_skip => 2,
769 y2_min_value => 0,
770 x_label_skip => 5,
771 x_label_position => 0.5,
772 x_tick_offset => -1,
773 bgclr => 'white',
774 transparent => 0,
775 two_axes => 1,
776 use_axis => [1,1,1,2,2,2],
777 legend_placement => 'RC',
778 legend_spacing => 6,
779 legend_marker_width => 24,
780 legend_marker_height => 18,
781 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
782 ) or die $graph1->error;
783
784 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
785
786 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
787 if (my $gd1 = $graph1->plot(\@mbias_read1)) {
788 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
789 binmode MBIAS_G1;
790 print MBIAS_G1 $gd1->png;
791 }
792 else {
793 warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n";
794 }
795
796 # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error;
797 # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
798 # binmode MBIAS_G1;
799 # print MBIAS_G1 $gd1->png;
800 }
801
802 if ($paired){
803
804 foreach my $context (qw(CpG CHG CHH)){
805 @{$mbias_read2[0]} = ();
806
807 print MBIAS "$context context (R2)\n================\n";
808 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
809 foreach my $pos (1..$max_length_2){
810
811 unless (defined $mbias_2{$context}->{$pos}->{meth}){
812 $mbias_2{$context}->{$pos}->{meth} = 0;
813 }
814 unless (defined $mbias_2{$context}->{$pos}->{un}){
815 $mbias_2{$context}->{$pos}->{un} = 0;
816 }
817
818 my $percent = '';
819 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
820 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
821 }
822 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
823
824 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
825
826 push @{$mbias_read2[0]},$pos;
827
828 if ($context eq 'CpG'){
829 push @{$mbias_read2[1]},$percent;
830 push @{$mbias_read2[4]},$coverage;
831 }
832 elsif ($context eq 'CHG'){
833 push @{$mbias_read2[2]},$percent;
834 push @{$mbias_read2[5]},$coverage;
835 }
836 elsif ($context eq 'CHH'){
837 push @{$mbias_read2[3]},$percent;
838 push @{$mbias_read2[6]},$coverage;
839 }
840 }
841 print MBIAS "\n";
842 }
843
844 if ( $gd_graph_installed){
845
846 add_colour(nice_blue => [31,120,180]);
847 add_colour(nice_orange => [255,127,0]);
848 add_colour(nice_green => [51,160,44]);
849 add_colour(pale_blue => [153,206,227]);
850 add_colour(pale_orange => [253,204,138]);
851 add_colour(pale_green => [191,230,207]);
852
853 $graph2->set(
854 x_label => 'position (bp)',
855 line_width => 2,
856 x_max_value => $max_length_1,
857 x_min_value => 0,
858 y_tick_number => 10,
859 y_label_skip => 2,
860 y1_max_value => 100,
861 y1_min_value => 0,
862 y_label_skip => 2,
863 y2_min_value => 0,
864 x_label_skip => 5,
865 x_label_position => 0.5,
866 x_tick_offset => -1,
867 bgclr => 'white',
868 transparent => 0,
869 two_axes => 1,
870 use_axis => [1,1,1,2,2,2],
871 legend_placement => 'RC',
872 legend_spacing => 6,
873 legend_marker_width => 24,
874 legend_marker_height => 18,
875 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
876 x_label => 'position (bp)',
877 y1_label => '% methylation',
878 y2_label => '# calls',
879 title => 'M-bias (Read 2)',
880 ) or die $graph2->error;
881
882 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
883
884 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
885 if (my $gd2 = $graph2->plot(\@mbias_read2)) {
886 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
887 binmode MBIAS_G2;
888 print MBIAS_G2 $gd2->png;
889 }
890 else {
891 warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n";
892 }
893
894 # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error;
895 # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
896 # binmode MBIAS_G2;
897 # print MBIAS_G2 $gd2->png;
898
899 }
900 }
901 }
902
903 sub process_commandline{
904 my $help;
905 my $single_end;
906 my $paired_end;
907 my $ignore;
908 my $ignore_r2;
909 my $genomic_fasta;
910 my $full;
911 my $report;
912 my $extractor_version;
913 my $no_overlap;
914 my $merge_non_CpG;
915 my $vanilla;
916 my $output_dir;
917 my $no_header;
918 my $bedGraph;
919 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
920 my $remove;
921 my $counts;
922 my $cytosine_report;
923 my $genome_folder;
924 my $zero;
925 my $CpG_only;
926 my $CX_context;
927 my $split_by_chromosome;
928 my $sort_size;
929 my $samtools_path;
930 my $gzip;
931 my $mbias_only;
932 my $mbias_off;
933 my $gazillion;
934 my $ample_mem;
935 my $include_overlap;
936 my $ignore_3prime;
937 my $ignore_3prime_r2;
938 my $multicore;
939
940 my $command_line = GetOptions ('help|man' => \$help,
941 'p|paired-end' => \$paired_end,
942 's|single-end' => \$single_end,
943 'fasta' => \$genomic_fasta,
944 'ignore=i' => \$ignore,
945 'ignore_r2=i' => \$ignore_r2,
946 'comprehensive' => \$full,
947 'report' => \$report,
948 'version' => \$extractor_version,
949 'no_overlap' => \$no_overlap,
950 'merge_non_CpG' => \$merge_non_CpG,
951 'vanilla' => \$vanilla,
952 'o|output=s' => \$output_dir,
953 'no_header' => \$no_header,
954 'bedGraph' => \$bedGraph,
955 "cutoff=i" => \$coverage_threshold,
956 "remove_spaces" => \$remove,
957 "counts" => \$counts,
958 "cytosine_report" => \$cytosine_report,
959 'g|genome_folder=s' => \$genome_folder,
960 "zero_based" => \$zero,
961 "CX|CX_context" => \$CX_context,
962 "split_by_chromosome" => \$split_by_chromosome,
963 "buffer_size=s" => \$sort_size,
964 'samtools_path=s' => \$samtools_path,
965 'gzip' => \$gzip,
966 'mbias_only' => \$mbias_only,
967 'mbias_off' => \$mbias_off,
968 'gazillion|scaffolds' => \$gazillion,
969 'ample_memory' => \$ample_mem,
970 'include_overlap' => \$include_overlap,
971 'ignore_3prime=i' => \$ignore_3prime,
972 'ignore_3prime_r2=i' => \$ignore_3prime_r2,
973 'multicore=i' => \$multicore,
974 );
975
976 ### EXIT ON ERROR if there were errors with any of the supplied options
977 unless ($command_line){
978 die "Please respecify command line options\n";
979 }
980
981 ### HELPFILE
982 if ($help){
983 print_helpfile();
984 exit;
985 }
986
987 if ($extractor_version){
988 print << "VERSION";
989
990
991 Bismark Methylation Extractor
992
993 Bismark Extractor Version: $version
994 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
995 www.bioinformatics.babraham.ac.uk/projects/bismark/
996
997
998 VERSION
999 exit;
1000 }
1001
1002
1003 ### no files provided
1004 unless (@ARGV){
1005 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
1006 }
1007 @filenames = @ARGV;
1008
1009 warn "\n *** Bismark methylation extractor version $version ***\n\n";
1010
1011 ### M-BIAS ONLY
1012 if ($mbias_only){
1013
1014 if ($mbias_off){
1015 die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n";
1016 }
1017 if ($bedGraph){
1018 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
1019 }
1020 if ($cytosine_report){
1021 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
1022 }
1023 if ($merge_non_CpG){
1024 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
1025 }
1026 if ($full){
1027 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
1028 }
1029 sleep(3);
1030 }
1031
1032 ### PRINT A REPORT
1033 unless ($report){
1034 $report = 1; # making this the default
1035 }
1036
1037 ### OUTPUT DIR PATH
1038 if ($output_dir){
1039 unless ($output_dir =~ /\/$/){
1040 $output_dir =~ s/$/\//;
1041 }
1042 }
1043 else{
1044 $output_dir = '';
1045 }
1046
1047 ### NO HEADER
1048 unless ($no_header){
1049 $no_header = 0;
1050 }
1051
1052 ### OLD (VANILLA) OUTPUT FORMAT
1053 unless ($vanilla){
1054 $vanilla = 0;
1055 }
1056
1057 if ($single_end){
1058 $paired_end = 0; ### SINGLE END ALIGNMENTS
1059 }
1060 elsif ($paired_end){
1061 $single_end = 0; ### PAIRED-END ALIGNMENTS
1062 }
1063 else{
1064
1065 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
1066
1067 if ($vanilla){
1068 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
1069 }
1070 else{ # SAM/BAM format
1071
1072 my $file = $filenames[0];
1073 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
1074
1075 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
1076 if ($file =~ /\.gz$/){
1077 open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
1078 }
1079 elsif ($file =~ /\.cram$/ || $file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam
1080 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM/CRAM file $file: $!\n";
1081 }
1082 else{
1083 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
1084 }
1085
1086 while (<DETERMINE>){
1087 last unless (/^\@/);
1088 if ($_ =~ /^\@PG/){
1089 # warn "found the \@PG line:\n";
1090 # warn "$_";
1091
1092 if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
1093 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
1094 $paired_end = 1;
1095 $single_end = 0;
1096 }
1097 else{
1098 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
1099 $paired_end = 0;
1100 $single_end = 1;
1101 }
1102 }
1103 }
1104
1105 # close DETERMINE or warn $!; # this always throws an error anyway...
1106
1107 }
1108 }
1109
1110 ### IGNORING <INT> 5' END
1111 # bases at the start of the read when processing the methylation call string
1112 unless ($ignore){
1113 $ignore = 0;
1114 }
1115
1116 if (defined $ignore_r2){
1117 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
1118 }
1119 else{
1120 $ignore_r2 = 0;
1121 }
1122
1123 ### IGNORING <INT> 3' END
1124 # bases at the end of the read when processing the methylation call string
1125 unless ($ignore_3prime){
1126 $ignore_3prime = 0;
1127 }
1128
1129 if (defined $ignore_3prime_r2){
1130 die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end);
1131 }
1132 else{
1133 $ignore_3prime_r2 = 0;
1134 }
1135
1136
1137 ### NO OVERLAP
1138 ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps
1139 if ($include_overlap){
1140 die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
1141 warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n";
1142 $no_overlap = 0;
1143 }
1144 else{ # default
1145 if ($paired_end){
1146 warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n";
1147 $no_overlap = 1;
1148 }
1149 }
1150
1151 ### COMPREHENSIVE OUTPUT
1152 unless ($full){
1153 $full = 0;
1154 }
1155
1156 ### MERGE NON-CpG context
1157 unless ($merge_non_CpG){
1158 $merge_non_CpG = 0;
1159 }
1160
1161 ### remove white spaces in read ID (needed for sorting using the sort command
1162 unless ($remove){
1163 $remove = 0;
1164 }
1165
1166 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
1167 if (defined $coverage_threshold){
1168 unless ($coverage_threshold > 0){
1169 die "Please select a coverage greater than 0 (positive integers only)\n";
1170 }
1171 }
1172 else{
1173 $coverage_threshold = 1;
1174 }
1175
1176 ### SORT buffer size
1177 if (defined $sort_size){
1178 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
1179 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
1180 }
1181 }
1182 else{
1183 $sort_size = '2G';
1184 }
1185
1186 if ($zero){
1187 die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
1188 }
1189
1190 if ($CX_context){
1191 die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
1192 }
1193 else{
1194 $CX_context = 0;
1195 }
1196
1197 unless ($counts){
1198 $counts = 1; # counts will always be set
1199 }
1200
1201 if ($cytosine_report){
1202
1203 ### GENOME folder
1204 if ($genome_folder){
1205 unless ($genome_folder =~/\/$/){
1206 $genome_folder =~ s/$/\//;
1207 }
1208 }
1209 else{
1210 die "Please specify a genome folder to proceed (full path only)\n";
1211 }
1212
1213 unless ($bedGraph){
1214 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
1215 $bedGraph = 1;
1216 }
1217 unless ($counts){
1218 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
1219 $counts = 1;
1220 }
1221 warn "\n";
1222 }
1223
1224 ### PATH TO SAMTOOLS
1225 if (defined $samtools_path){
1226 # if Samtools was specified as full command
1227 if ($samtools_path =~ /samtools$/){
1228 if (-e $samtools_path){
1229 # Samtools executable found
1230 }
1231 else{
1232 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
1233 }
1234 }
1235 else{
1236 unless ($samtools_path =~ /\/$/){
1237 $samtools_path =~ s/$/\//;
1238 }
1239 $samtools_path .= 'samtools';
1240 if (-e $samtools_path){
1241 # Samtools executable found
1242 }
1243 else{
1244 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
1245 }
1246 }
1247 }
1248 # Check whether Samtools is in the PATH if no path was supplied by the user
1249 else{
1250 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
1251 $samtools_path = `which samtools`;
1252 chomp $samtools_path;
1253 }
1254 }
1255
1256 unless (defined $samtools_path){
1257 $samtools_path = '';
1258 }
1259
1260
1261 if ($gazillion){
1262 if ($ample_mem){
1263 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
1264 }
1265 }
1266
1267 if (defined $multicore){
1268 unless ($multicore > 0){
1269 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n";
1270 }
1271 if ($multicore > 20){
1272 warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n";
1273 }
1274 }
1275 else{
1276 $multicore = 1; # default. Single-thread mode
1277 warn "Setting core usage to single-threaded (default). Consider using --multicore <int> to speed up the extraction process.\n\n";
1278 }
1279
1280 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore);
1281 }
1282
1283
1284 sub test_positional_sorting{
1285
1286 my $filename = shift;
1287
1288 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
1289 sleep(1);
1290
1291 if ($filename =~ /\.gz$/) {
1292 open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
1293 }
1294 elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
1295 if ($samtools_path){
1296 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n";
1297 }
1298 else{
1299 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1300 }
1301 }
1302 else {
1303 open (TEST,$filename) or die "Can't open file $filename: $!\n";
1304 }
1305
1306 my $count = 0;
1307
1308 while (<TEST>) {
1309 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
1310 if (/^\@SO/) {
1311 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1312 }
1313 next;
1314 }
1315 $count++;
1316
1317 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
1318
1319 my ($id_1) = (split (/\t/));
1320
1321 ### reading the next line which should be read 2
1322 $_ = <TEST>;
1323 my ($id_2) = (split (/\t/));
1324 last unless ($id_2);
1325 ++$count;
1326
1327 if ($id_1 eq $id_2){
1328 ### ids are the same
1329 next;
1330 }
1331 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
1332 my $id_1_trunc = $id_1;
1333 $id_1_trunc =~ s/\/1$//;
1334 my $id_2_trunc = $id_2;
1335 $id_2_trunc =~ s/\/2$//;
1336
1337 unless ($id_1_trunc eq $id_2_trunc){
1338 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1339 }
1340 }
1341 }
1342 # close TEST or die $!; somehow fails on our cluster...
1343 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
1344 warn "...passed!\n";
1345 sleep(1);
1346
1347 }
1348
1349 sub process_Bismark_results_file{
1350
1351 my $filename = shift;
1352 my $report_filename = open_output_filehandles($filename);
1353
1354 ### disabling buffering so we don't run into problems with half written out lines...
1355 foreach my $fh (keys %fhs){
1356 if ($fh =~ /^[1230]$/) {
1357 foreach my $context (keys %{$fhs{$fh}}) {
1358 select($fhs{$fh}->{$context});
1359 $|++;
1360 }
1361 }
1362 else{
1363 select($fhs{$fh});
1364 $|++;
1365 }
1366 }
1367 select(STDOUT);
1368
1369 ################################################
1370 ################################################
1371 ### multi-process handling
1372 ###
1373
1374 my $offset = 1;
1375 my $process_id;
1376 my @pids;
1377 if ($multicore > 1){
1378
1379 until ($offset == $multicore){
1380 # warn "multicore: $multicore\noffset: $offset\n";
1381 my $fork = fork;
1382
1383 if (defined $fork){
1384 if ($fork != 0){
1385 $process_id = $fork;
1386 push @pids, $process_id;
1387 if ($offset < $multicore){
1388 ++$offset;
1389 # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n";
1390 }
1391 else{
1392 # warn "Reached the number of maximum multicores. Proceeeding to processing...\n";
1393 }
1394 }
1395 elsif ($fork == 0){
1396 # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n";
1397 $process_id = $fork;
1398 last;
1399 }
1400 }
1401 else{
1402 die "Forking unsuccessful. Proceeding using a single thread only\n";
1403 }
1404 }
1405
1406 # warn "\nThe thread identity\n===================\n";
1407 if ($process_id){
1408 # print "I am the parent process. My children are called:\n";
1409 # print join ("\t",@pids),"\n";
1410 # print "I am going to process the following line count: $offset\n\n";
1411 }
1412 elsif($process_id == 0){
1413 # warn "I am a child process: Process ID: $process_id\n";
1414 # warn "I am going to process the following line count: $offset\n\n";
1415 }
1416 else{
1417 die "Process ID was: $process_id\n";
1418 }
1419 }
1420 else{
1421 # warn "Single-core mode: setting pid to 1\n";
1422 $process_id = 1;
1423 }
1424
1425 ################################################
1426 ################################################
1427 if ($process_id){
1428 warn "Now reading in Bismark result file $filename\n";
1429 }
1430 else{
1431 warn "\nNow reading in Bismark result file $filename\n\n";
1432 }
1433
1434 if ($filename =~ /\.gz$/) {
1435 open (IN,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
1436 }
1437 elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
1438 if ($samtools_path){
1439 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n";
1440 }
1441 else{
1442 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1443 }
1444 }
1445 else {
1446 open (IN,$filename) or die "Can't open file $filename: $!\n";
1447 }
1448
1449 ### Vanilla and SAM output need to read different numbers of header lines
1450 if ($vanilla) {
1451 my $bismark_version = <IN>; ## discarding the Bismark version info
1452 chomp $bismark_version;
1453 $bismark_version =~ s/\r//; # replaces \r line feed
1454 $bismark_version =~ s/Bismark version: //;
1455 if ($bismark_version =~ /^\@/) {
1456 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
1457 sleep (1);
1458 }
1459
1460 unless ($version eq $bismark_version){
1461 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
1462 }
1463 } else {
1464 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
1465 # We are reading from it further down
1466 }
1467
1468 my $methylation_call_strings_processed = 0;
1469 my $line_count = 0;
1470
1471 ### proceeding differently now for single-end or paired-end Bismark files
1472
1473 ### PROCESSING SINGLE-END RESULT FILES
1474 if ($single) {
1475
1476 ### also proceeding differently now for SAM format or vanilla Bismark format files
1477 if ($vanilla) { # old vanilla Bismark output format
1478 while (<IN>) {
1479 ++$line_count;
1480 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1481
1482 if ( ($line_count - $offset)%$multicore == 0){
1483 # warn "line count: $line_count\noffset: $offset\n";
1484 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1485 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1486 }
1487 else{
1488 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1489 next;
1490 }
1491
1492 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1493 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
1494
1495 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
1496 ### last position
1497 chomp $genome_conversion;
1498
1499 my $index;
1500 if ($meth_call) {
1501
1502 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1503 $index = 0;
1504 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1505 $index = 1;
1506 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1507 $index = 3;
1508 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1509 $index = 2;
1510 } else {
1511 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1512 }
1513
1514 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1515 if ($ignore) {
1516 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1517
1518 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1519 if ($strand eq '+') {
1520 $start += $ignore;
1521 }
1522 elsif ($strand eq '-') {
1523 $start += length($meth_call)-1; ## $meth_call is already shortened!
1524 }
1525 else {
1526 die "Alignment did not have proper strand information: $strand\n";
1527 }
1528 }
1529
1530 ### Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int>
1531 if ($ignore_3prime) {
1532
1533 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
1534
1535 ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly
1536 if ($strand eq '+') {
1537 # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all
1538 }
1539 elsif ($strand eq '-') {
1540 # here we need to discriminate if the start has been adjusted because of --ignore or not
1541 if ($ignore){
1542 # position adjusted already, and because of this 3' trimming is irrelevant for the start position
1543 }
1544 else{
1545 # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step
1546 $start += $ignore_3prime;
1547 }
1548 }
1549 else {
1550 die "Alignment did not have proper strand information: $strand\n";
1551 }
1552 }
1553 ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage
1554
1555 ### printing out the methylation state of every C in the read
1556 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
1557
1558 ++$methylation_call_strings_processed; # 1 per single-end result
1559 }
1560 }
1561 } else { # processing single-end SAM format (default)
1562 while (<IN>) {
1563 chomp;
1564 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1565 if (/^\@/) { # skipping header lines (starting with @)
1566 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0);
1567 next;
1568 }
1569
1570 ++$line_count;
1571 # warn "$line_count\n";
1572 warn "Processed lines: $line_count\n" if ($line_count%500000 == 0);
1573
1574 if ( ($line_count - $offset)%$multicore == 0){
1575 # warn "line count: $line_count\noffset: $offset\n";
1576 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1577 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1578 }
1579 else{
1580 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1581 next;
1582 }
1583
1584 # example read in SAM format
1585 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1586 ###
1587
1588 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1589 # < 0.7.6 $meth_call =~ s/^XM:Z://;
1590 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
1591 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
1592
1593 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
1594
1595 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1596 my $meth_call; ### Thanks to Zachary Zeno for this solution
1597 my $read_conversion;
1598 my $genome_conversion;
1599
1600 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1601 my $tag = $1;
1602 my $value = $2;
1603
1604 if ($tag eq "XM") {
1605 $meth_call = $value;
1606 $meth_call =~ s/\r//;
1607 } elsif ($tag eq "XR") {
1608 $read_conversion = $value;
1609 $read_conversion =~ s/\r//;
1610 } elsif ($tag eq "XG") {
1611 $genome_conversion = $value;
1612 $genome_conversion =~ s/\r//;
1613 }
1614 }
1615
1616 my $strand;
1617 # warn "$meth_call\n$read_conversion\n$genome_conversion\n";
1618
1619 my $index;
1620 if ($meth_call) {
1621 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1622 $index = 0;
1623 $strand = '+';
1624 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1625 $index = 1;
1626 $strand = '-';
1627 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1628 $index = 2;
1629 $strand = '+';
1630 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1631 $index = 3;
1632 $strand = '-';
1633 } else {
1634 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1635 }
1636
1637 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
1638 if ($strand eq '-') {
1639 $meth_call = reverse $meth_call;
1640 }
1641 # warn "\n$meth_call\n";
1642
1643 ### IGNORE 5 PRIME: Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1644 if ($ignore) {
1645 # warn "\n\n$meth_call\n";
1646 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1647 # warn "$meth_call\n";sleep(1);
1648
1649 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1650
1651 my @len = split (/\D+/,$cigar); # storing the length per operation
1652 my @ops = split (/\d+/,$cigar); # storing the operation
1653 shift @ops; # remove the empty first element
1654 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1655
1656 my @comp_cigar; # building an array with all CIGAR operations
1657 foreach my $index (0..$#len) {
1658 foreach (1..$len[$index]) {
1659 # print "$ops[$index]";
1660 push @comp_cigar, $ops[$index];
1661 }
1662 }
1663 # print "original CIGAR: $cigar\n";
1664 # print "original CIGAR: @comp_cigar\n";
1665
1666 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1667 if ($strand eq '+') {
1668
1669 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1670 my $I_count = 0;
1671
1672 for (1..$ignore) {
1673 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
1674 # print "$_ deleted $op\n";
1675
1676 while ($op eq 'D') { # repeating this for deletions (D)
1677 $D_count++;
1678 $op = shift @comp_cigar;
1679 # print "$_ deleted $op\n";
1680 }
1681 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1682 $I_count++;
1683 }
1684 }
1685 $start += $ignore + $D_count - $I_count;
1686 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
1687 }
1688 elsif ($strand eq '-') {
1689
1690 for (1..$ignore) {
1691 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1692 while ($op eq 'D') { # repeating this for deletions (D)
1693 $op = pop @comp_cigar;
1694 }
1695 }
1696
1697 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
1698 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
1699 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
1700 foreach (@comp_cigar) {
1701 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
1702 }
1703 $start += $MD_count - 1;
1704 }
1705
1706 ### reconstituting shortened CIGAR string
1707 my $new_cigar;
1708 my $count = 0;
1709 my $last_op;
1710 # print "ignore adjusted: @comp_cigar\n";
1711 foreach my $op (@comp_cigar) {
1712 unless (defined $last_op){
1713 $last_op = $op;
1714 ++$count;
1715 next;
1716 }
1717 if ($last_op eq $op) {
1718 ++$count;
1719 } else {
1720 $new_cigar .= "$count$last_op";
1721 $last_op = $op;
1722 $count = 1;
1723 }
1724 }
1725 $new_cigar .= "$count$last_op"; # appending the last operation and count
1726 $cigar = $new_cigar;
1727 # print "ignore adjusted scalar: $cigar\n";
1728 }
1729
1730 #######################
1731 ### INGORE 3' END ###
1732 #######################
1733
1734 # Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int>
1735 if ($ignore_3prime) {
1736 # warn "$meth_call\n";
1737 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
1738 # warn "$meth_call\n";sleep(1);
1739
1740 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1741
1742 my @len = split (/\D+/,$cigar); # storing the length per operation
1743 my @ops = split (/\d+/,$cigar); # storing the operation
1744 shift @ops; # remove the empty first element
1745 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1746
1747 my @comp_cigar; # building an array with all CIGAR operations
1748 foreach my $index (0..$#len) {
1749 foreach (1..$len[$index]) {
1750 # print "$ops[$index]";
1751 push @comp_cigar, $ops[$index];
1752 }
1753 }
1754
1755 # print "original CIGAR: $cigar\n";
1756 # print join ("",@comp_cigar),"\n";
1757
1758 ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly
1759 if ($strand eq '+') {
1760
1761 ### clipping the 3' end does not affect the starting position of forward strand alignments
1762 # ignore 5' has already been taken care of at this stage, if relevant at all
1763
1764 for (1..$ignore_3prime) {
1765 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end
1766 # print join ("",@comp_cigar),"\n";
1767 # print "$_ deleted $op from 3' end\n";
1768 while ($op eq 'D') { # repeating this for deletions (D)
1769 $op = pop @comp_cigar;
1770 # print join ("",@comp_cigar),"\n";
1771 # print "$_ deleted $op from 3' end\n";
1772 }
1773 }
1774 # print "Final truncated CIGAR string:\n";
1775 # print join ("",@comp_cigar),"\n";
1776 # $start += $ignore + $D_count - $I_count;
1777 # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n";
1778 }
1779 elsif ($strand eq '-') {
1780
1781 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1782 my $I_count = 0;
1783
1784 for (1..$ignore_3prime) {
1785 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
1786 while ($op eq 'D') { # repeating this for deletions (D)
1787 $D_count++;
1788 $op = shift @comp_cigar;
1789 }
1790 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1791 $I_count++;
1792 }
1793
1794 }
1795
1796 # here we need to discriminate if the start has been adjusted because of --ignore or not
1797 if ($ignore){
1798 # the start position has already been modified for --ignore already, so we don't have to adjust the position again now
1799 }
1800 else{
1801 # Here we need to add the length ignore_3prime to the read starting position
1802 # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step
1803 $start += $ignore_3prime + $D_count - $I_count;
1804 }
1805
1806 }
1807
1808 ### reconstituting shortened CIGAR string
1809 my $new_cigar;
1810 my $count = 0;
1811 my $last_op;
1812 # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n";
1813 foreach my $op (@comp_cigar) {
1814 unless (defined $last_op){
1815 $last_op = $op;
1816 ++$count;
1817 next;
1818 }
1819 if ($last_op eq $op) {
1820 ++$count;
1821 } else {
1822 $new_cigar .= "$count$last_op";
1823 $last_op = $op;
1824 $count = 1;
1825 }
1826 }
1827 $new_cigar .= "$count$last_op"; # appending the last operation and count
1828 $cigar = $new_cigar;
1829 # print "ignore_3prime adjusted scalar: $cigar\n";
1830 }
1831 }
1832 ### printing out the methylation state of every C in the read
1833 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
1834
1835 ++$methylation_call_strings_processed; # 1 per single-end result
1836 }
1837 }
1838 }
1839
1840 ### PROCESSING PAIRED-END RESULT FILES
1841 elsif ($paired) {
1842
1843 ### proceeding differently now for SAM format or vanilla Bismark format files
1844 if ($vanilla) { # old vanilla Bismark paired-end output format
1845 while (<IN>) {
1846 ++$line_count;
1847 warn "processed line: $line_count\n" if ($line_count%500000 == 0);
1848
1849 if ( ($line_count - $offset)%$multicore == 0){
1850 # warn "line count: $line_count\noffset: $offset\n";
1851 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1852 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1853 }
1854 else{
1855 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1856 next;
1857 }
1858
1859 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1860 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
1861
1862 my $index;
1863 chomp $genome_conversion;
1864
1865 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1866 $index = 0; ## this is OT
1867 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1868 $index = 2; ## this is CTOB!!!
1869 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1870 $index = 1; ## this is CTOT!!!
1871 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1872 $index = 3; ## this is OB
1873 } else {
1874 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1875 }
1876
1877 if ($meth_call_1 and $meth_call_2) {
1878 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1879
1880 ### IGNORE FROM 5' END
1881 if ($ignore) {
1882 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1883
1884 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1885 $start_read_1 += $ignore;
1886 }
1887 if ($ignore_r2) {
1888 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
1889
1890 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
1891 $end_read_2 -= $ignore_r2;
1892 }
1893
1894 ### IGNORE 3' END
1895
1896 ### Clipping off the last <int> number of bases from the methylation call string of Read 1 as specified with --ignore_3prime <int>
1897 if ($ignore_3prime) {
1898 $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime);
1899 # we don't have to adjust the position now since the shortened methylation call will be fine, see below
1900 }
1901 ### Clipping off the last <int> number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 <int>
1902 if ($ignore_3prime_r2) {
1903 $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2);
1904 # we don't have to adjust the position now since the shortened methylation call will be fine, see below
1905 }
1906
1907 my $end_read_1;
1908 my $start_read_2;
1909
1910 if ($strand eq '+') {
1911
1912 $end_read_1 = $start_read_1 + length($meth_call_1) - 1;
1913 $start_read_2 = $end_read_2 - length($meth_call_2) + 1;
1914
1915 ## we first pass the first read which is in + orientation on the forward strand
1916 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
1917
1918 # we next pass the second read which is in - orientation on the reverse strand
1919 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1920 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
1921 }
1922 else {
1923
1924 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
1925 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
1926
1927 ## we first pass the first read which is in - orientation on the reverse strand
1928 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
1929
1930 # we next pass the second read which is in + orientation on the forward strand
1931 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1932 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
1933 }
1934
1935 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1936 }
1937 }
1938 }
1939 else { # Bismark paired-end BAM/SAM output format (default)
1940 while (<IN>) {
1941 chomp;
1942 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1943 if (/^\@/) { # skipping header lines (starting with @)
1944 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess
1945 next;
1946 }
1947
1948 ++$line_count;
1949 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1950
1951 if ( ($line_count - $offset)%$multicore == 0){
1952 # warn "line count: $line_count\noffset: $offset\n";
1953 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1954 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1955 }
1956 else{
1957 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1958 $_ = <IN>; # reading and skipping another line since this is the paired-end read
1959 next;
1960 }
1961
1962 # example paired-end reads in SAM format (2 consecutive lines)
1963 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1964 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
1965
1966 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1967 my $meth_call_1;
1968 my $first_read_conversion;
1969 my $genome_conversion;
1970
1971 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1972 my $tag = $1;
1973 my $value = $2;
1974
1975 if ($tag eq "XM") {
1976 $meth_call_1 = $value;
1977 $meth_call_1 =~ s/\r//;
1978 } elsif ($tag eq "XR") {
1979 $first_read_conversion = $value;
1980 $first_read_conversion =~ s/\r//;
1981 } elsif ($tag eq "XG") {
1982 $genome_conversion = $value;
1983 $genome_conversion =~ s/\r//;
1984 }
1985 }
1986
1987 $_ = <IN>; # reading in the paired read
1988 chomp;
1989
1990 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1991
1992 my $meth_call_2;
1993 my $second_read_conversion;
1994
1995 while ( /(XM|XR):Z:([^\t]+)/g ) {
1996 my $tag = $1;
1997 my $value = $2;
1998
1999 if ($tag eq "XM") {
2000 $meth_call_2 = $value;
2001 $meth_call_2 =~ s/\r//;
2002 } elsif ($tag eq "XR") {
2003 $second_read_conversion = $value;
2004 $second_read_conversion = s/\r//;
2005 }
2006 }
2007
2008 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
2009 # chomp $genome_conversion; # in case it captured a new line character # already removed
2010
2011 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
2012
2013 my $index;
2014 my $strand;
2015
2016 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
2017 $index = 0; ## this is OT
2018 $strand = '+';
2019 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
2020 $index = 1; ## this is CTOT
2021 $strand = '-';
2022 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
2023 $index = 2; ## this is CTOB
2024 $strand = '+';
2025 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
2026 $index = 3; ## this is OB
2027 $strand = '-';
2028 } else {
2029 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
2030 }
2031
2032 ### reversing the methylation call of the read that was reverse-complemented
2033 if ($strand eq '+') {
2034 $meth_call_2 = reverse $meth_call_2;
2035 } else {
2036 $meth_call_1 = reverse $meth_call_1;
2037 }
2038 # warn "\n$meth_call_1\n$meth_call_2\n";
2039
2040 if ($meth_call_1 and $meth_call_2) {
2041
2042 my $end_read_1;
2043
2044 ### READ 1
2045 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
2046 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
2047 shift @ops_1; # remove the empty first element
2048
2049 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
2050
2051 my @comp_cigar_1; # building an array with all CIGAR operations
2052 foreach my $index (0..$#len_1) {
2053 foreach (1..$len_1[$index]) {
2054 # print "$ops_1[$index]";
2055 push @comp_cigar_1, $ops_1[$index];
2056 }
2057 }
2058 # print "original CIGAR read 1: $cigar_1\n";
2059 # print "original CIGAR read 1: @comp_cigar_1\n";
2060
2061 ### READ 2
2062 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
2063 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
2064 shift @ops_2; # remove the empty first element
2065 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
2066 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
2067 foreach my $index (0..$#len_2) {
2068 foreach (1..$len_2[$index]) {
2069 # print "$ops_2[$index]";
2070 push @comp_cigar_2, $ops_2[$index];
2071 }
2072 }
2073 # print "original CIGAR read 2: $cigar_2\n";
2074 # print "original CIGAR read 2: @comp_cigar_2\n";
2075
2076 ##################################
2077 ### IGNORE BASES FROM 5' END ###
2078 ##################################
2079
2080 if ($ignore) {
2081 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore <int>' for read 1
2082 ### the methylation calls have already been reversed where necessary
2083
2084 if ( (length($meth_call_1) - $ignore) <= 0){
2085 # next; # skipping this read entirely if the read is shorter than the portion to be ignored
2086 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2087 }
2088 else {
2089 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
2090
2091 if ($strand eq '+') {
2092
2093 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
2094 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
2095 my $I_count_1 = 0;
2096
2097 for (1..$ignore) {
2098 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
2099 # print "$_ deleted $op\n";
2100
2101 while ($op eq 'D') { # repeating this for deletions (D)
2102 $D_count_1++;
2103 $op = shift @comp_cigar_1;
2104 # print "$_ deleted $op\n";
2105 }
2106 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2107 $I_count_1++;
2108 }
2109 }
2110
2111 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
2112 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
2113
2114 # the start position of reads mapping to the reverse strand is being adjusted further below
2115 }
2116 elsif ($strand eq '-') {
2117
2118 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
2119 for (1..$ignore) {
2120 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2121 while ($op eq 'D') { # repeating this for deletions (D)
2122 $op = pop @comp_cigar_1;
2123 }
2124 }
2125 # the start position of reads mapping to the reverse strand is being adjusted further below
2126
2127 }
2128 }
2129 }
2130
2131 if ($ignore_r2) {
2132 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
2133 ### the methylation calls have already been reversed where necessary
2134
2135 if ( (length($meth_call_2) - $ignore_r2) <= 0){
2136 # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair!
2137 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2138 }
2139 else {
2140 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
2141
2142 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
2143
2144 if ($strand eq '+') {
2145
2146 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
2147
2148 for (1..$ignore_r2) {
2149 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2150 while ($op eq 'D') { # repeating this for deletions (D)
2151 $op = pop @comp_cigar_2;
2152 }
2153 }
2154 # the start position of reads mapping to the reverse strand is being adjusted further below
2155 }
2156 elsif ($strand eq '-') {
2157
2158 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
2159 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
2160 my $I_count_2 = 0;
2161
2162 for (1..$ignore_r2) {
2163 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
2164 # print "$_ deleted $op\n";
2165
2166 while ($op eq 'D') { # repeating this for deletions (D)
2167 $D_count_2++;
2168 $op = shift @comp_cigar_2;
2169 # print "$_ deleted $op\n";
2170 }
2171 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2172 $I_count_2++;
2173 }
2174 }
2175
2176 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
2177 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
2178 }
2179 }
2180 }
2181
2182 if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string
2183
2184 ### reconstituting shortened CIGAR string 1
2185 my $new_cigar_1;
2186 my $count_1 = 0;
2187 my $last_op_1;
2188 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
2189 foreach my $op (@comp_cigar_1) {
2190 unless (defined $last_op_1){
2191 $last_op_1 = $op;
2192 ++$count_1;
2193 next;
2194 }
2195 if ($last_op_1 eq $op) {
2196 ++$count_1;
2197 } else {
2198 $new_cigar_1 .= "$count_1$last_op_1";
2199 $last_op_1 = $op;
2200 $count_1 = 1;
2201 }
2202 }
2203 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
2204 $cigar_1 = $new_cigar_1;
2205 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
2206 }
2207
2208 if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2209
2210 ### reconstituting shortened CIGAR string 2
2211 my $new_cigar_2;
2212 my $count_2 = 0;
2213 my $last_op_2;
2214 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
2215 foreach my $op (@comp_cigar_2) {
2216 unless (defined $last_op_2){
2217 $last_op_2 = $op;
2218 ++$count_2;
2219 next;
2220 }
2221 if ($last_op_2 eq $op) {
2222 ++$count_2;
2223 }
2224 else {
2225 $new_cigar_2 .= "$count_2$last_op_2";
2226 $last_op_2 = $op;
2227 $count_2 = 1;
2228 }
2229 }
2230 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
2231 $cigar_2 = $new_cigar_2;
2232 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
2233 }
2234
2235 ###########################
2236 ### END IGNORE 5' END ###
2237 ###########################
2238
2239 ##################################
2240 ### IGNORE BASES FROM 3' END ###
2241 ##################################
2242
2243 # print "CIGAR string before truncating 3' end (Read 1)\n";
2244 # print join ("",@comp_cigar_1),"\n";
2245
2246 if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further
2247
2248 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime <int>' for read 1
2249 ### the methylation calls have already been reversed where necessary
2250
2251 if ( (length($meth_call_1) - $ignore_3prime) <= 0){
2252 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2253 }
2254 else {
2255 $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime);
2256 # warn "truncated meth_call 1:\n$meth_call_1\n";
2257
2258 if ($strand eq '+') {
2259
2260 ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments
2261 # ignore 5' has already been taken care of at this stage, if relevant at all
2262
2263 for (1..$ignore_3prime) {
2264 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end
2265 # print "$_ deleted $op from 3' end\n";
2266 # print join ("",@comp_cigar_1),"\n";
2267
2268 while ($op eq 'D') { # repeating this for deletions (D)
2269 $op = pop @comp_cigar_1;
2270 # print join ("",@comp_cigar_1),"\n";
2271 # print "$_ deleted $op from 3' end\n";
2272 }
2273 }
2274
2275 # print "Final truncated CIGAR string (Read 1):\n";
2276 # print join ("",@comp_cigar_1),"\n";
2277
2278 }
2279 elsif ($strand eq '-') {
2280
2281 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
2282 my $I_count_1 = 0;
2283
2284 ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start
2285 for (1..$ignore_3prime) {
2286 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
2287 # print join ("",@comp_cigar_1),"\n";
2288
2289 while ($op eq 'D') { # repeating this for deletions (D)
2290 $D_count_1++;
2291 $op = shift @comp_cigar_1;
2292 # print join ("",@comp_cigar_1),"\n";
2293 }
2294
2295 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2296 $I_count_1++;
2297 }
2298 }
2299
2300 # print "Final truncated CIGAR string reverse_read:\n";
2301 # print join ("",@comp_cigar_1),"\n";
2302
2303 # Here we need to add the length ignore_3prime to the read starting position
2304 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
2305 $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1;
2306
2307 }
2308 }
2309 }
2310
2311 if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further
2312
2313 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime_r2 <int>' for read 2
2314 ### the methylation calls have already been reversed where necessary
2315
2316 if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){
2317 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2318 }
2319 else {
2320 $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2);
2321 # warn "truncated meth_call 2:\n$meth_call_2\n";
2322
2323 ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly
2324
2325 if ($strand eq '+') {
2326
2327 ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment)
2328 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start
2329
2330 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
2331 my $I_count_2 = 0;
2332
2333 for (1..$ignore_3prime_r2) {
2334 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array
2335 # print "$_ deleted $op from 3' end\n";
2336 # print join ("",@comp_cigar_2),"\n";
2337
2338 while ($op eq 'D') { # repeating this for deletions (D)
2339 $D_count_2++;
2340 $op = shift @comp_cigar_2;
2341 # print "$_ deleted $op from 3' end\n";
2342 # print join ("",@comp_cigar_2),"\n";
2343 }
2344
2345 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2346 $I_count_2++;
2347 }
2348 }
2349
2350 # print "Final truncated CIGAR string 2 (+ alignment):\n";
2351 # print join ("",@comp_cigar_2),"\n";
2352
2353 # Here we need to add the length ignore_3prime_r2 to the read starting position
2354 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
2355 $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2;
2356
2357 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
2358 }
2359 elsif ($strand eq '-') {
2360 ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment)
2361 ### ignore_r2 5' has already been taken care of at this stage, if relevant at all
2362
2363 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end
2364
2365 for (1..$ignore_3prime_r2) {
2366 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
2367 # print "$_ deleted $op\n";
2368 # print join ("",@comp_cigar_2),"\n";
2369
2370 while ($op eq 'D') { # repeating this for deletions (D)
2371 $op = pop @comp_cigar_2;
2372 # print "$_ deleted $op\n";
2373 # print join ("",@comp_cigar_2),"\n";
2374 }
2375 }
2376
2377 # print "Final truncated CIGAR string 2 (- alignment):\n";
2378 # print join ("",@comp_cigar_2),"\n";
2379
2380 }
2381 }
2382 }
2383
2384 if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2385
2386 ### reconstituting shortened CIGAR string 1
2387 my $new_cigar_1;
2388 my $count_1 = 0;
2389 my $last_op_1;
2390 # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n";
2391 foreach my $op (@comp_cigar_1) {
2392 unless (defined $last_op_1){
2393 $last_op_1 = $op;
2394 ++$count_1;
2395 next;
2396 }
2397 if ($last_op_1 eq $op) {
2398 ++$count_1;
2399 }
2400 else {
2401 $new_cigar_1 .= "$count_1$last_op_1";
2402 $last_op_1 = $op;
2403 $count_1 = 1;
2404 }
2405 }
2406 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
2407 $cigar_1 = $new_cigar_1;
2408 # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n";
2409 }
2410
2411 if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2412
2413 ### reconstituting shortened CIGAR string 2
2414 my $new_cigar_2;
2415 my $count_2 = 0;
2416 my $last_op_2;
2417 # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n";
2418 foreach my $op (@comp_cigar_2) {
2419 unless (defined $last_op_2){
2420 $last_op_2 = $op;
2421 ++$count_2;
2422 next;
2423 }
2424 if ($last_op_2 eq $op) {
2425 ++$count_2;
2426 }
2427 else {
2428 $new_cigar_2 .= "$count_2$last_op_2";
2429 $last_op_2 = $op;
2430 $count_2 = 1;
2431 }
2432 }
2433 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
2434 $cigar_2 = $new_cigar_2;
2435 # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
2436 }
2437
2438 ###########################
2439 ### END IGNORE 3' END ###
2440 ###########################
2441
2442
2443 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
2444
2445 if ($strand eq '+') {
2446 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
2447 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2448 # print "reverse: @comp_cigar_2\n";
2449
2450 my $MD_count_1 = 0;
2451 foreach (@comp_cigar_1) {
2452 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2453 }
2454
2455 my $MD_count_2 = 0;
2456 foreach (@comp_cigar_2) {
2457 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2458 }
2459
2460 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
2461 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
2462 }
2463 else {
2464 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
2465
2466 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2467 # print "reverse: @comp_cigar_1\n";
2468
2469 my $MD_count_1 = 0;
2470 foreach (@comp_cigar_1) {
2471 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2472 }
2473
2474 $end_read_1 = $start_read_1;
2475 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
2476 }
2477
2478 if ($strand eq '+') {
2479 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
2480 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
2481
2482 # we next pass the second read which is in - orientation on the reverse strand
2483 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
2484 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
2485 }
2486 else {
2487 ## we first pass the first read which is in - orientation on the reverse strand
2488 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
2489
2490 # we next pass the second read which is in + orientation on the forward strand
2491 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
2492 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
2493 }
2494
2495 $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings
2496 }
2497 }
2498 }
2499 } else {
2500 die "Single-end or paired-end reads not specified properly\n";
2501 }
2502
2503 $counting{sequences_count} = $line_count;
2504 $counting{methylation_call_strings} = $methylation_call_strings_processed;
2505
2506 if ($multicore == 1){
2507 print_splitting_report ();
2508 }
2509 elsif ($multicore > 1){
2510 print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
2511 print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
2512 }
2513
2514 return ($process_id,\@pids,$report_filename);
2515
2516 }
2517
2518
2519
2520 sub print_splitting_report{
2521
2522 ### Calculating methylation percentages if applicable
2523 warn "\nProcessed $counting{sequences_count} lines in total\n";
2524 warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
2525 if ($report) {
2526 print REPORT "\nProcessed $counting{sequences_count} lines in total\n";
2527 print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
2528 }
2529
2530 my $percent_meCpG;
2531 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2532 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2533 }
2534
2535 my $percent_meCHG;
2536 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2537 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2538 }
2539
2540 my $percent_meCHH;
2541 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2542 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2543 }
2544
2545 my $percent_non_CpG_methylation;
2546 if ($merge_non_CpG){
2547 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2548 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
2549 }
2550 }
2551
2552 if ($report){
2553 ### detailed information about Cs analysed
2554 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2555
2556 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2557 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2558
2559 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2560 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2561 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2562
2563 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2564 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2565 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2566
2567 ### calculating methylated CpG percentage if applicable
2568 if ($percent_meCpG){
2569 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2570 }
2571 else{
2572 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2573 }
2574
2575 ### 2-Context Output
2576 if ($merge_non_CpG){
2577 if ($percent_non_CpG_methylation){
2578 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2579 }
2580 else{
2581 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2582 }
2583 }
2584
2585 ### 3 Context Output
2586 else{
2587 ### calculating methylated CHG percentage if applicable
2588 if ($percent_meCHG){
2589 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2590 }
2591 else{
2592 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2593 }
2594
2595 ### calculating methylated CHH percentage if applicable
2596 if ($percent_meCHH){
2597 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2598 }
2599 else{
2600 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2601 }
2602 }
2603 }
2604
2605 ### detailed information about Cs analysed for on-screen report
2606 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
2607
2608 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2609 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
2610
2611 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2612 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2613 warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2614
2615 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2616 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2617 warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2618
2619 ### printing methylated CpG percentage if applicable
2620 if ($percent_meCpG){
2621 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
2622 }
2623 else{
2624 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2625 }
2626
2627 ### 2-Context Output
2628 if ($merge_non_CpG){
2629 if ($percent_non_CpG_methylation){
2630 warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2631 }
2632 else{
2633 warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2634 }
2635 }
2636
2637 ### 3-Context Output
2638 else{
2639 ### printing methylated CHG percentage if applicable
2640 if ($percent_meCHG){
2641 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
2642 }
2643 else{
2644 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2645 }
2646
2647 ### printing methylated CHH percentage if applicable
2648 if ($percent_meCHH){
2649 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2650 }
2651 else{
2652 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2653 }
2654 }
2655 }
2656
2657 ###
2658
2659 sub print_splitting_report_multicore{
2660
2661 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
2662
2663 # warn "\$report_filename is $report_filename\n";
2664 my $special_report = $report_filename.".$offset";
2665
2666 open (SPECIAL_REPORT,'>',$special_report) or die $!;
2667 # warn "line count\t$line_count\n";
2668 # warn "meth call strings\t$meth_call_strings\n";
2669
2670 print SPECIAL_REPORT "line count\t$line_count\n";
2671 print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n";
2672
2673 ### Calculating methylation percentages if applicable
2674 my $percent_meCpG;
2675 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2676 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2677 }
2678
2679 my $percent_meCHG;
2680 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2681 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2682 }
2683
2684 my $percent_meCHH;
2685 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2686 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2687 }
2688
2689 my $percent_non_CpG_methylation;
2690 if ($merge_non_CpG){
2691 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2692 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
2693 }
2694 }
2695
2696 if ($report){
2697
2698 ### detailed information about Cs analysed
2699 print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2700
2701 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2702 print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2703
2704 print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2705 print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2706 print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2707
2708 print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2709 print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2710 print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2711
2712 ### calculating methylated CpG percentage if applicable
2713 if ($percent_meCpG){
2714 print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2715 }
2716 else{
2717 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2718 }
2719
2720 ### 2-Context Output
2721 if ($merge_non_CpG){
2722 if ($percent_non_CpG_methylation){
2723 print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2724 }
2725 else{
2726 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2727 }
2728 }
2729
2730 ### 3 Context Output
2731 else{
2732 ### calculating methylated CHG percentage if applicable
2733 if ($percent_meCHG){
2734 print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2735 }
2736 else{
2737 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2738 }
2739
2740 ### calculating methylated CHH percentage if applicable
2741 if ($percent_meCHH){
2742 print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2743 }
2744 else{
2745 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2746 }
2747 }
2748 }
2749 close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n";
2750 }
2751
2752
2753 ###
2754
2755 ### INDIVIDUAL M-BIAS REPORTS
2756
2757 sub print_mbias_report_multicore{
2758
2759 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
2760
2761 # warn "\$report_filename is $report_filename\n";
2762 my $special_mbias_report = $report_filename.".${offset}.mbias";
2763
2764 open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!;
2765
2766 # determining maximum read length
2767 my $max_length_1 = 0;
2768 my $max_length_2 = 0;
2769
2770 foreach my $context (keys %mbias_1){
2771 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
2772 $max_length_1 = $pos unless ($max_length_1 >= $pos);
2773 }
2774 }
2775 if ($paired){
2776 foreach my $context (keys %mbias_2){
2777 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
2778 $max_length_2 = $pos unless ($max_length_2 >= $pos);
2779 }
2780 }
2781 }
2782
2783 if ($single){
2784 # warn "Determining maximum read length for M-Bias plot\n";
2785 # warn "Maximum read length of Read 1: $max_length_1\n\n";
2786 }
2787 else{
2788 # warn "Determining maximum read lengths for M-Bias plots\n";
2789 # warn "Maximum read length of Read 1: $max_length_1\n";
2790 # warn "Maximum read length of Read 2: $max_length_2\n\n";
2791 }
2792
2793 foreach my $context (qw(CpG CHG CHH)){
2794
2795 if ($paired){
2796 print SPECIAL_MBIAS "$context context (R1)\n================\n";
2797 }
2798 else{
2799 print SPECIAL_MBIAS "$context context\n===========\n";
2800 }
2801 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
2802
2803 foreach my $pos (1..$max_length_1){
2804
2805 unless (defined $mbias_1{$context}->{$pos}->{meth}){
2806 $mbias_1{$context}->{$pos}->{meth} = 0;
2807 }
2808 unless (defined $mbias_1{$context}->{$pos}->{un}){
2809 $mbias_1{$context}->{$pos}->{un} = 0;
2810 }
2811
2812 my $percent = '';
2813 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
2814 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
2815 }
2816 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
2817
2818 print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
2819 }
2820 print SPECIAL_MBIAS "\n";
2821 }
2822
2823 if ($paired){
2824
2825 foreach my $context (qw(CpG CHG CHH)){
2826
2827 print SPECIAL_MBIAS "$context context (R2)\n================\n";
2828 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
2829
2830 foreach my $pos (1..$max_length_2){
2831
2832 unless (defined $mbias_2{$context}->{$pos}->{meth}){
2833 $mbias_2{$context}->{$pos}->{meth} = 0;
2834 }
2835 unless (defined $mbias_2{$context}->{$pos}->{un}){
2836 $mbias_2{$context}->{$pos}->{un} = 0;
2837 }
2838
2839 my $percent = '';
2840 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
2841 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
2842 }
2843 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
2844
2845 print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
2846 }
2847 }
2848 }
2849
2850 close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n";
2851 }
2852
2853
2854 ###
2855
2856
2857 sub print_individual_C_methylation_states_paired_end_files{
2858
2859 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
2860
2861 unless (defined $meth_call) {
2862 return; # skip this read
2863 }
2864
2865 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
2866 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
2867
2868 my @methylation_calls = split(//,$meth_call);
2869
2870 #################################################################
2871 ### . for bases not involving cytosines ###
2872 ### X for methylated C in CHG context (was protected) ###
2873 ### x for not methylated C in CHG context (was converted) ###
2874 ### H for methylated C in CHH context (was protected) ###
2875 ### h for not methylated C in CHH context (was converted) ###
2876 ### Z for methylated C in CpG context (was protected) ###
2877 ### z for not methylated C in CpG context (was converted) ###
2878 ### U for methylated C in Unknown context (was protected) ###
2879 ### u for not methylated C in Unknown context (was converted) ###
2880 #################################################################
2881
2882 my $methyl_CHG_count = 0;
2883 my $methyl_CHH_count = 0;
2884 my $methyl_CpG_count = 0;
2885 my $unmethylated_CHG_count = 0;
2886 my $unmethylated_CHH_count = 0;
2887 my $unmethylated_CpG_count = 0;
2888
2889 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2890 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2891 my @comp_cigar;
2892
2893 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
2894 if ($cigar =~ /^\d+M$/){
2895 # this check speeds up the extraction process by up to 60%!!!
2896 }
2897 else{ # parsing CIGAR string
2898 my @len;
2899 my @ops;
2900 @len = split (/\D+/,$cigar); # storing the length per operation
2901 @ops = split (/\d+/,$cigar); # storing the operation
2902 shift @ops; # remove the empty first element
2903
2904 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2905
2906 foreach my $index (0..$#len){
2907 foreach (1..$len[$index]){
2908 # print "$ops[$index]";
2909 push @comp_cigar, $ops[$index];
2910 }
2911 }
2912 # warn "\nDetected CIGAR string: $cigar\n";
2913 # warn "Length of methylation call: ",length $meth_call,"\n";
2914 # warn "number of operations: ",scalar @ops,"\n";
2915 # warn "number of length digits: ",scalar @len,"\n\n";
2916 # print @comp_cigar,"\n";
2917 # print "$meth_call\n\n";
2918 # sleep (1);
2919 }
2920
2921 if ($strand eq '-') {
2922
2923 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
2924 if (@comp_cigar){
2925 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2926 }
2927 # print "reverse CIGAR string: @comp_cigar\n";
2928
2929 ### the start position of paired-end files has already been corrected, see above
2930 }
2931
2932 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2933
2934 if ($merge_non_CpG) {
2935 if ($no_overlap) { # this has to be read 2...
2936
2937 ### single-file CpG and non-CpG context output
2938 if ($full) {
2939 if ($strand eq '+') {
2940 for my $index (0..$#methylation_calls) {
2941
2942 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2945 $cigar_offset += $cigar_mod;
2946 $pos_offset += $pos_mod;
2947 }
2948
2949 ### Returning as soon as the methylation calls start overlapping
2950 if ($start+$index+$pos_offset >= $end_read_1) {
2951 return;
2952 }
2953
2954 if ($methylation_calls[$index] eq 'X') {
2955 $counting{total_meCHG_count}++;
2956 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2957 if ($read_identity == 1){
2958 $mbias_1{CHG}->{$index+1}->{meth}++;
2959 }
2960 else{
2961 $mbias_2{CHG}->{$index+1}->{meth}++;
2962 }
2963 }
2964 elsif ($methylation_calls[$index] eq 'x') {
2965 $counting{total_unmethylated_CHG_count}++;
2966 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2967 if ($read_identity == 1){
2968 $mbias_1{CHG}->{$index+1}->{un}++;
2969 }
2970 else{
2971 $mbias_2{CHG}->{$index+1}->{un}++;
2972 }
2973 }
2974 elsif ($methylation_calls[$index] eq 'Z') {
2975 $counting{total_meCpG_count}++;
2976 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2977 if ($read_identity == 1){
2978 $mbias_1{CpG}->{$index+1}->{meth}++;
2979 }
2980 else{
2981 $mbias_2{CpG}->{$index+1}->{meth}++;
2982 }
2983 }
2984 elsif ($methylation_calls[$index] eq 'z') {
2985 $counting{total_unmethylated_CpG_count}++;
2986 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2987 if ($read_identity == 1){
2988 $mbias_1{CpG}->{$index+1}->{un}++;
2989 }
2990 else{
2991 $mbias_2{CpG}->{$index+1}->{un}++;
2992 }
2993 }
2994 elsif ($methylation_calls[$index] eq 'H') {
2995 $counting{total_meCHH_count}++;
2996 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2997 if ($read_identity == 1){
2998 $mbias_1{CHH}->{$index+1}->{meth}++;
2999 }
3000 else{
3001 $mbias_2{CHH}->{$index+1}->{meth}++;
3002 }
3003 }
3004 elsif ($methylation_calls[$index] eq 'h') {
3005 $counting{total_unmethylated_CHH_count}++;
3006 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3007 if ($read_identity == 1){
3008 $mbias_1{CHH}->{$index+1}->{un}++;
3009 }
3010 else{
3011 $mbias_2{CHH}->{$index+1}->{un}++;
3012 }
3013 }
3014 elsif ($methylation_calls[$index] eq '.'){}
3015 elsif (lc$methylation_calls[$index] eq 'u'){}
3016 else{
3017 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3018 }
3019 }
3020 }
3021 elsif ($strand eq '-') {
3022 for my $index (0..$#methylation_calls) {
3023
3024 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3025 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3026 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3027 $cigar_offset += $cigar_mod;
3028 $pos_offset += $pos_mod;
3029 }
3030
3031 ### Returning as soon as the methylation calls start overlapping
3032 if ($start-$index+$pos_offset <= $end_read_1) {
3033 return;
3034 }
3035
3036 if ($methylation_calls[$index] eq 'X') {
3037 $counting{total_meCHG_count}++;
3038 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3039 if ($read_identity == 1){
3040 $mbias_1{CHG}->{$index+1}->{meth}++;
3041 }
3042 else{
3043 $mbias_2{CHG}->{$index+1}->{meth}++;
3044 }
3045 }
3046 elsif ($methylation_calls[$index] eq 'x') {
3047 $counting{total_unmethylated_CHG_count}++;
3048 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3049 if ($read_identity == 1){
3050 $mbias_1{CHG}->{$index+1}->{un}++;
3051 }
3052 else{
3053 $mbias_2{CHG}->{$index+1}->{un}++;
3054 }
3055 }
3056 elsif ($methylation_calls[$index] eq 'Z') {
3057 $counting{total_meCpG_count}++;
3058 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3059 if ($read_identity == 1){
3060 $mbias_1{CpG}->{$index+1}->{meth}++;
3061 }
3062 else{
3063 $mbias_2{CpG}->{$index+1}->{meth}++;
3064 }
3065 }
3066 elsif ($methylation_calls[$index] eq 'z') {
3067 $counting{total_unmethylated_CpG_count}++;
3068 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3069 if ($read_identity == 1){
3070 $mbias_1{CpG}->{$index+1}->{un}++;
3071 }
3072 else{
3073 $mbias_2{CpG}->{$index+1}->{un}++;
3074 }
3075 }
3076 elsif ($methylation_calls[$index] eq 'H') {
3077 $counting{total_meCHH_count}++;
3078 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3079 if ($read_identity == 1){
3080 $mbias_1{CHH}->{$index+1}->{meth}++;
3081 }
3082 else{
3083 $mbias_2{CHH}->{$index+1}->{meth}++;
3084 }
3085 }
3086 elsif ($methylation_calls[$index] eq 'h') {
3087 $counting{total_unmethylated_CHH_count}++;
3088 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3089 if ($read_identity == 1){
3090 $mbias_1{CHH}->{$index+1}->{un}++;
3091 }
3092 else{
3093 $mbias_2{CHH}->{$index+1}->{un}++;
3094 }
3095 }
3096 elsif ($methylation_calls[$index] eq '.') {}
3097 elsif (lc$methylation_calls[$index] eq 'u'){}
3098 else{
3099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3100 }
3101 }
3102 } else {
3103 die "The read orientation was neither + nor -: '$strand'\n";
3104 }
3105 }
3106
3107 ### strand-specific methylation output
3108 else {
3109 if ($strand eq '+') {
3110 for my $index (0..$#methylation_calls) {
3111
3112 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3113 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3114 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3115 $cigar_offset += $cigar_mod;
3116 $pos_offset += $pos_mod;
3117 }
3118
3119 ### Returning as soon as the methylation calls start overlapping
3120 if ($start+$index+$pos_offset >= $end_read_1) {
3121 return;
3122 }
3123
3124 if ($methylation_calls[$index] eq 'X') {
3125 $counting{total_meCHG_count}++;
3126 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3127 if ($read_identity == 1){
3128 $mbias_1{CHG}->{$index+1}->{meth}++;
3129 }
3130 else{
3131 $mbias_2{CHG}->{$index+1}->{meth}++;
3132 }
3133 }
3134 elsif ($methylation_calls[$index] eq 'x') {
3135 $counting{total_unmethylated_CHG_count}++;
3136 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3137 if ($read_identity == 1){
3138 $mbias_1{CHG}->{$index+1}->{un}++;
3139 }
3140 else{
3141 $mbias_2{CHG}->{$index+1}->{un}++;
3142 }
3143 }
3144 elsif ($methylation_calls[$index] eq 'Z') {
3145 $counting{total_meCpG_count}++;
3146 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3147 if ($read_identity == 1){
3148 $mbias_1{CpG}->{$index+1}->{meth}++;
3149 }
3150 else{
3151 $mbias_2{CpG}->{$index+1}->{meth}++;
3152 }
3153 }
3154 elsif ($methylation_calls[$index] eq 'z') {
3155 $counting{total_unmethylated_CpG_count}++;
3156 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3157 if ($read_identity == 1){
3158 $mbias_1{CpG}->{$index+1}->{un}++;
3159 }
3160 else{
3161 $mbias_2{CpG}->{$index+1}->{un}++;
3162 }
3163 }
3164 elsif ($methylation_calls[$index] eq 'H') {
3165 $counting{total_meCHH_count}++;
3166 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3167 if ($read_identity == 1){
3168 $mbias_1{CHH}->{$index+1}->{meth}++;
3169 }
3170 else{
3171 $mbias_2{CHH}->{$index+1}->{meth}++;
3172 }
3173 }
3174 elsif ($methylation_calls[$index] eq 'h') {
3175 $counting{total_unmethylated_CHH_count}++;
3176 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3177 if ($read_identity == 1){
3178 $mbias_1{CHH}->{$index+1}->{un}++;
3179 }
3180 else{
3181 $mbias_2{CHH}->{$index+1}->{un}++;
3182 }
3183 }
3184 elsif ($methylation_calls[$index] eq '.') {}
3185 elsif (lc$methylation_calls[$index] eq 'u'){}
3186 else{
3187 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3188 }
3189 }
3190 } elsif ($strand eq '-') {
3191 for my $index (0..$#methylation_calls) {
3192
3193 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3194 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3195 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3196 $cigar_offset += $cigar_mod;
3197 $pos_offset += $pos_mod;
3198 }
3199
3200 ### Returning as soon as the methylation calls start overlapping
3201 if ($start-$index+$pos_offset <= $end_read_1) {
3202 return;
3203 }
3204
3205 if ($methylation_calls[$index] eq 'X') {
3206 $counting{total_meCHG_count}++;
3207 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3208 if ($read_identity == 1){
3209 $mbias_1{CHG}->{$index+1}->{meth}++;
3210 }
3211 else{
3212 $mbias_2{CHG}->{$index+1}->{meth}++;
3213 }
3214 }
3215 elsif ($methylation_calls[$index] eq 'x') {
3216 $counting{total_unmethylated_CHG_count}++;
3217 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3218 if ($read_identity == 1){
3219 $mbias_1{CHG}->{$index+1}->{un}++;
3220 }
3221 else{
3222 $mbias_2{CHG}->{$index+1}->{un}++;
3223 }
3224 }
3225 elsif ($methylation_calls[$index] eq 'Z') {
3226 $counting{total_meCpG_count}++;
3227 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3228 if ($read_identity == 1){
3229 $mbias_1{CpG}->{$index+1}->{meth}++;
3230 }
3231 else{
3232 $mbias_2{CpG}->{$index+1}->{meth}++;
3233 }
3234 }
3235 elsif ($methylation_calls[$index] eq 'z') {
3236 $counting{total_unmethylated_CpG_count}++;
3237 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3238 if ($read_identity == 1){
3239 $mbias_1{CpG}->{$index+1}->{un}++;
3240 }
3241 else{
3242 $mbias_2{CpG}->{$index+1}->{un}++;
3243 }
3244 }
3245 elsif ($methylation_calls[$index] eq 'H') {
3246 $counting{total_meCHH_count}++;
3247 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3248 if ($read_identity == 1){
3249 $mbias_1{CHH}->{$index+1}->{meth}++;
3250 }
3251 else{
3252 $mbias_2{CHH}->{$index+1}->{meth}++;
3253 }
3254 }
3255 elsif ($methylation_calls[$index] eq 'h') {
3256 $counting{total_unmethylated_CHH_count}++;
3257 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3258 if ($read_identity == 1){
3259 $mbias_1{CHH}->{$index+1}->{un}++;
3260 }
3261 else{
3262 $mbias_2{CHH}->{$index+1}->{un}++;
3263 }
3264 }
3265 elsif ($methylation_calls[$index] eq '.') {}
3266 elsif (lc$methylation_calls[$index] eq 'u'){}
3267 else{
3268 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3269 }
3270 }
3271 } else {
3272 die "The strand orientation was neither + nor -: '$strand'/n";
3273 }
3274 }
3275 }
3276
3277 ### this is the default paired-end procedure allowing overlaps and using every single C position
3278 ### Still within the 2-CONTEXT ONLY optional section
3279 else {
3280 ### single-file CpG and non-CpG context output
3281 if ($full) {
3282 if ($strand eq '+') {
3283 for my $index (0..$#methylation_calls) {
3284
3285 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3286 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3287 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3288 $cigar_offset += $cigar_mod;
3289 $pos_offset += $pos_mod;
3290 }
3291
3292 if ($methylation_calls[$index] eq 'X') {
3293 $counting{total_meCHG_count}++;
3294 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3295 if ($read_identity == 1){
3296 $mbias_1{CHG}->{$index+1}->{meth}++;
3297 }
3298 else{
3299 $mbias_2{CHG}->{$index+1}->{meth}++;
3300 }
3301 }
3302 elsif ($methylation_calls[$index] eq 'x') {
3303 $counting{total_unmethylated_CHG_count}++;
3304 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3305 if ($read_identity == 1){
3306 $mbias_1{CHG}->{$index+1}->{un}++;
3307 }
3308 else{
3309 $mbias_2{CHG}->{$index+1}->{un}++;
3310 }
3311 }
3312 elsif ($methylation_calls[$index] eq 'Z') {
3313 $counting{total_meCpG_count}++;
3314 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3315 if ($read_identity == 1){
3316 $mbias_1{CpG}->{$index+1}->{meth}++;
3317 }
3318 else{
3319 $mbias_2{CpG}->{$index+1}->{meth}++;
3320 }
3321 }
3322 elsif ($methylation_calls[$index] eq 'z') {
3323 $counting{total_unmethylated_CpG_count}++;
3324 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3325 if ($read_identity == 1){
3326 $mbias_1{CpG}->{$index+1}->{un}++;
3327 }
3328 else{
3329 $mbias_2{CpG}->{$index+1}->{un}++;
3330 }
3331 }
3332 elsif ($methylation_calls[$index] eq 'H') {
3333 $counting{total_meCHH_count}++;
3334 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3335 if ($read_identity == 1){
3336 $mbias_1{CHH}->{$index+1}->{meth}++;
3337 }
3338 else{
3339 $mbias_2{CHH}->{$index+1}->{meth}++;
3340 }
3341 }
3342 elsif ($methylation_calls[$index] eq 'h') {
3343 $counting{total_unmethylated_CHH_count}++;
3344 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3345 if ($read_identity == 1){
3346 $mbias_1{CHH}->{$index+1}->{un}++;
3347 }
3348 else{
3349 $mbias_2{CHH}->{$index+1}->{un}++;
3350 }
3351 }
3352 elsif ($methylation_calls[$index] eq '.') {}
3353 elsif (lc$methylation_calls[$index] eq 'u'){}
3354 else{
3355 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3356 }
3357 }
3358 } elsif ($strand eq '-') {
3359 for my $index (0..$#methylation_calls) {
3360
3361 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3362 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3363 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3364 $cigar_offset += $cigar_mod;
3365 $pos_offset += $pos_mod;
3366 }
3367
3368 if ($methylation_calls[$index] eq 'X') {
3369 $counting{total_meCHG_count}++;
3370 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3371 if ($read_identity == 1){
3372 $mbias_1{CHG}->{$index+1}->{meth}++;
3373 }
3374 else{
3375 $mbias_2{CHG}->{$index+1}->{meth}++;
3376 }
3377 }
3378 elsif ($methylation_calls[$index] eq 'x') {
3379 $counting{total_unmethylated_CHG_count}++;
3380 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3381 if ($read_identity == 1){
3382 $mbias_1{CHG}->{$index+1}->{un}++;
3383 }
3384 else{
3385 $mbias_2{CHG}->{$index+1}->{un}++;
3386 }
3387 }
3388 elsif ($methylation_calls[$index] eq 'Z') {
3389 $counting{total_meCpG_count}++;
3390 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3391 if ($read_identity == 1){
3392 $mbias_1{CpG}->{$index+1}->{meth}++;
3393 }
3394 else{
3395 $mbias_2{CpG}->{$index+1}->{meth}++;
3396 }
3397 }
3398 elsif ($methylation_calls[$index] eq 'z') {
3399 $counting{total_unmethylated_CpG_count}++;
3400 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3401 if ($read_identity == 1){
3402 $mbias_1{CpG}->{$index+1}->{un}++;
3403 }
3404 else{
3405 $mbias_2{CpG}->{$index+1}->{un}++;
3406 }
3407 }
3408 elsif ($methylation_calls[$index] eq 'H') {
3409 $counting{total_meCHH_count}++;
3410 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3411 if ($read_identity == 1){
3412 $mbias_1{CHH}->{$index+1}->{meth}++;
3413 }
3414 else{
3415 $mbias_2{CHH}->{$index+1}->{meth}++;
3416 }
3417 }
3418 elsif ($methylation_calls[$index] eq 'h') {
3419 $counting{total_unmethylated_CHH_count}++;
3420 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3421 if ($read_identity == 1){
3422 $mbias_1{CHH}->{$index+1}->{un}++;
3423 }
3424 else{
3425 $mbias_2{CHH}->{$index+1}->{un}++;
3426 }
3427 }
3428 elsif ($methylation_calls[$index] eq '.') {}
3429 elsif (lc$methylation_calls[$index] eq 'u'){}
3430 else{
3431 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3432 }
3433 }
3434 } else {
3435 die "The strand orientation as neither + nor -: '$strand'\n";
3436 }
3437 }
3438
3439 ### strand-specific methylation output
3440 ### still within the 2-CONTEXT optional section
3441 else {
3442 if ($strand eq '+') {
3443 for my $index (0..$#methylation_calls) {
3444
3445 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3446 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3447 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3448 $cigar_offset += $cigar_mod;
3449 $pos_offset += $pos_mod;
3450 }
3451
3452 if ($methylation_calls[$index] eq 'X') {
3453 $counting{total_meCHG_count}++;
3454 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3455 if ($read_identity == 1){
3456 $mbias_1{CHG}->{$index+1}->{meth}++;
3457 }
3458 else{
3459 $mbias_2{CHG}->{$index+1}->{meth}++;
3460 }
3461 }
3462 elsif ($methylation_calls[$index] eq 'x') {
3463 $counting{total_unmethylated_CHG_count}++;
3464 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3465 if ($read_identity == 1){
3466 $mbias_1{CHG}->{$index+1}->{un}++;
3467 }
3468 else{
3469 $mbias_2{CHG}->{$index+1}->{un}++;
3470 }
3471 }
3472 elsif ($methylation_calls[$index] eq 'Z') {
3473 $counting{total_meCpG_count}++;
3474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3475 if ($read_identity == 1){
3476 $mbias_1{CpG}->{$index+1}->{meth}++;
3477 }
3478 else{
3479 $mbias_2{CpG}->{$index+1}->{meth}++;
3480 }
3481 }
3482 elsif ($methylation_calls[$index] eq 'z') {
3483 $counting{total_unmethylated_CpG_count}++;
3484 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3485 if ($read_identity == 1){
3486 $mbias_1{CpG}->{$index+1}->{un}++;
3487 }
3488 else{
3489 $mbias_2{CpG}->{$index+1}->{un}++;
3490 }
3491 }
3492 elsif ($methylation_calls[$index] eq 'H') {
3493 $counting{total_meCHH_count}++;
3494 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3495 if ($read_identity == 1){
3496 $mbias_1{CHH}->{$index+1}->{meth}++;
3497 }
3498 else{
3499 $mbias_2{CHH}->{$index+1}->{meth}++;
3500 }
3501 }
3502 elsif ($methylation_calls[$index] eq 'h') {
3503 $counting{total_unmethylated_CHH_count}++;
3504 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3505 if ($read_identity == 1){
3506 $mbias_1{CHH}->{$index+1}->{un}++;
3507 }
3508 else{
3509 $mbias_2{CHH}->{$index+1}->{un}++;
3510 }
3511 }
3512 elsif ($methylation_calls[$index] eq '.') {}
3513 elsif (lc$methylation_calls[$index] eq 'u'){}
3514 else{
3515 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3516 }
3517 }
3518 } elsif ($strand eq '-') {
3519 for my $index (0..$#methylation_calls) {
3520
3521 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3522 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3523 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3524 $cigar_offset += $cigar_mod;
3525 $pos_offset += $pos_mod;
3526 }
3527
3528 if ($methylation_calls[$index] eq 'X') {
3529 $counting{total_meCHG_count}++;
3530 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3531 if ($read_identity == 1){
3532 $mbias_1{CHG}->{$index+1}->{meth}++;
3533 }
3534 else{
3535 $mbias_2{CHG}->{$index+1}->{meth}++;
3536 }
3537 }
3538 elsif ($methylation_calls[$index] eq 'x') {
3539 $counting{total_unmethylated_CHG_count}++;
3540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3541 if ($read_identity == 1){
3542 $mbias_1{CHG}->{$index+1}->{un}++;
3543 }
3544 else{
3545 $mbias_2{CHG}->{$index+1}->{un}++;
3546 }
3547 }
3548 elsif ($methylation_calls[$index] eq 'Z') {
3549 $counting{total_meCpG_count}++;
3550 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3551 if ($read_identity == 1){
3552 $mbias_1{CpG}->{$index+1}->{meth}++;
3553 }
3554 else{
3555 $mbias_2{CpG}->{$index+1}->{meth}++;
3556 }
3557 }
3558 elsif ($methylation_calls[$index] eq 'z') {
3559 $counting{total_unmethylated_CpG_count}++;
3560 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3561 if ($read_identity == 1){
3562 $mbias_1{CpG}->{$index+1}->{un}++;
3563 }
3564 else{
3565 $mbias_2{CpG}->{$index+1}->{un}++;
3566 }
3567 }
3568 elsif ($methylation_calls[$index] eq 'H') {
3569 $counting{total_meCHH_count}++;
3570 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3571 if ($read_identity == 1){
3572 $mbias_1{CHH}->{$index+1}->{meth}++;
3573 }
3574 else{
3575 $mbias_2{CHH}->{$index+1}->{meth}++;
3576 }
3577 }
3578 elsif ($methylation_calls[$index] eq 'h') {
3579 $counting{total_unmethylated_CHH_count}++;
3580 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3581 if ($read_identity == 1){
3582 $mbias_1{CHH}->{$index+1}->{un}++;
3583 }
3584 else{
3585 $mbias_2{CHH}->{$index+1}->{un}++;
3586 }
3587 }
3588 elsif ($methylation_calls[$index] eq '.') {}
3589 elsif (lc$methylation_calls[$index] eq 'u'){}
3590 else{
3591 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3592 }
3593 }
3594 } else {
3595 die "The strand orientation as neither + nor -: '$strand'\n";
3596 }
3597 }
3598 }
3599 }
3600
3601 ############################################
3602 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
3603 ############################################
3604
3605 elsif ($no_overlap) {
3606 ### single-file CpG, CHG and CHH context output
3607 if ($full) {
3608 if ($strand eq '+') {
3609 for my $index (0..$#methylation_calls) {
3610
3611 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3612 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3613 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3614 $cigar_offset += $cigar_mod;
3615 $pos_offset += $pos_mod;
3616 }
3617
3618 ### Returning as soon as the methylation calls start overlapping
3619 if ($start+$index+$pos_offset >= $end_read_1) {
3620 return;
3621 }
3622
3623 if ($methylation_calls[$index] eq 'X') {
3624 $counting{total_meCHG_count}++;
3625 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3626 if ($read_identity == 1){
3627 $mbias_1{CHG}->{$index+1}->{meth}++;
3628 }
3629 else{
3630 $mbias_2{CHG}->{$index+1}->{meth}++;
3631 }
3632 }
3633 elsif ($methylation_calls[$index] eq 'x') {
3634 $counting{total_unmethylated_CHG_count}++;
3635 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3636 if ($read_identity == 1){
3637 $mbias_1{CHG}->{$index+1}->{un}++;
3638 }
3639 else{
3640 $mbias_2{CHG}->{$index+1}->{un}++;
3641 }
3642 }
3643 elsif ($methylation_calls[$index] eq 'Z') {
3644 $counting{total_meCpG_count}++;
3645 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3646 if ($read_identity == 1){
3647 $mbias_1{CpG}->{$index+1}->{meth}++;
3648 }
3649 else{
3650 $mbias_2{CpG}->{$index+1}->{meth}++;
3651 }
3652 }
3653 elsif ($methylation_calls[$index] eq 'z') {
3654 $counting{total_unmethylated_CpG_count}++;
3655 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3656 if ($read_identity == 1){
3657 $mbias_1{CpG}->{$index+1}->{un}++;
3658 }
3659 else{
3660 $mbias_2{CpG}->{$index+1}->{un}++;
3661 }
3662 }
3663 elsif ($methylation_calls[$index] eq 'H') {
3664 $counting{total_meCHH_count}++;
3665 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3666 if ($read_identity == 1){
3667 $mbias_1{CHH}->{$index+1}->{meth}++;
3668 }
3669 else{
3670 $mbias_2{CHH}->{$index+1}->{meth}++;
3671 }
3672 }
3673 elsif ($methylation_calls[$index] eq 'h') {
3674 $counting{total_unmethylated_CHH_count}++;
3675 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3676 if ($read_identity == 1){
3677 $mbias_1{CHH}->{$index+1}->{un}++;
3678 }
3679 else{
3680 $mbias_2{CHH}->{$index+1}->{un}++;
3681 }
3682 }
3683 elsif ($methylation_calls[$index] eq '.') {}
3684 elsif (lc$methylation_calls[$index] eq 'u'){}
3685 else{
3686 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3687 }
3688 }
3689 } elsif ($strand eq '-') {
3690 for my $index (0..$#methylation_calls) {
3691
3692 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3693 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3694 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3695 $cigar_offset += $cigar_mod;
3696 $pos_offset += $pos_mod;
3697 }
3698
3699 ### Returning as soon as the methylation calls start overlapping
3700 if ($start-$index+$pos_offset <= $end_read_1) {
3701 return;
3702 }
3703
3704 if ($methylation_calls[$index] eq 'X') {
3705 $counting{total_meCHG_count}++;
3706 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3707 if ($read_identity == 1){
3708 $mbias_1{CHG}->{$index+1}->{meth}++;
3709 }
3710 else{
3711 $mbias_2{CHG}->{$index+1}->{meth}++;
3712 }
3713 }
3714 elsif ($methylation_calls[$index] eq 'x') {
3715 $counting{total_unmethylated_CHG_count}++;
3716 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3717 if ($read_identity == 1){
3718 $mbias_1{CHG}->{$index+1}->{un}++;
3719 }
3720 else{
3721 $mbias_2{CHG}->{$index+1}->{un}++;
3722 }
3723 }
3724 elsif ($methylation_calls[$index] eq 'Z') {
3725 $counting{total_meCpG_count}++;
3726 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3727 if ($read_identity == 1){
3728 $mbias_1{CpG}->{$index+1}->{meth}++;
3729 }
3730 else{
3731 $mbias_2{CpG}->{$index+1}->{meth}++;
3732 }
3733 }
3734 elsif ($methylation_calls[$index] eq 'z') {
3735 $counting{total_unmethylated_CpG_count}++;
3736 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3737 if ($read_identity == 1){
3738 $mbias_1{CpG}->{$index+1}->{un}++;
3739 }
3740 else{
3741 $mbias_2{CpG}->{$index+1}->{un}++;
3742 }
3743 }
3744 elsif ($methylation_calls[$index] eq 'H') {
3745 $counting{total_meCHH_count}++;
3746 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3747 if ($read_identity == 1){
3748 $mbias_1{CHH}->{$index+1}->{meth}++;
3749 }
3750 else{
3751 $mbias_2{CHH}->{$index+1}->{meth}++;
3752 }
3753 }
3754 elsif ($methylation_calls[$index] eq 'h') {
3755 $counting{total_unmethylated_CHH_count}++;
3756 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3757 if ($read_identity == 1){
3758 $mbias_1{CHH}->{$index+1}->{un}++;
3759 }
3760 else{
3761 $mbias_2{CHH}->{$index+1}->{un}++;
3762 }
3763 }
3764 elsif ($methylation_calls[$index] eq '.') {}
3765 elsif (lc$methylation_calls[$index] eq 'u'){}
3766 else{
3767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3768 }
3769 }
3770 } else {
3771 die "The strand orientation as neither + nor -: '$strand'\n";
3772 }
3773 }
3774
3775 ### strand-specific methylation output
3776 else {
3777 if ($strand eq '+') {
3778 for my $index (0..$#methylation_calls) {
3779
3780 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3781 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3782 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3783 $cigar_offset += $cigar_mod;
3784 $pos_offset += $pos_mod;
3785 }
3786
3787 ### Returning as soon as the methylation calls start overlapping
3788 if ($start+$index+$pos_offset >= $end_read_1) {
3789 return;
3790 }
3791
3792 if ($methylation_calls[$index] eq 'X') {
3793 $counting{total_meCHG_count}++;
3794 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3795 if ($read_identity == 1){
3796 $mbias_1{CHG}->{$index+1}->{meth}++;
3797 }
3798 else{
3799 $mbias_2{CHG}->{$index+1}->{meth}++;
3800 }
3801 }
3802 elsif ($methylation_calls[$index] eq 'x') {
3803 $counting{total_unmethylated_CHG_count}++;
3804 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3805 if ($read_identity == 1){
3806 $mbias_1{CHG}->{$index+1}->{un}++;
3807 }
3808 else{
3809 $mbias_2{CHG}->{$index+1}->{un}++;
3810 }
3811 }
3812 elsif ($methylation_calls[$index] eq 'Z') {
3813 $counting{total_meCpG_count}++;
3814 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3815 if ($read_identity == 1){
3816 $mbias_1{CpG}->{$index+1}->{meth}++;
3817 }
3818 else{
3819 $mbias_2{CpG}->{$index+1}->{meth}++;
3820 }
3821 }
3822 elsif ($methylation_calls[$index] eq 'z') {
3823 $counting{total_unmethylated_CpG_count}++;
3824 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3825 if ($read_identity == 1){
3826 $mbias_1{CpG}->{$index+1}->{un}++;
3827 }
3828 else{
3829 $mbias_2{CpG}->{$index+1}->{un}++;
3830 }
3831 }
3832 elsif ($methylation_calls[$index] eq 'H') {
3833 $counting{total_meCHH_count}++;
3834 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3835 if ($read_identity == 1){
3836 $mbias_1{CHH}->{$index+1}->{meth}++;
3837 }
3838 else{
3839 $mbias_2{CHH}->{$index+1}->{meth}++;
3840 }
3841 }
3842 elsif ($methylation_calls[$index] eq 'h') {
3843 $counting{total_unmethylated_CHH_count}++;
3844 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3845 if ($read_identity == 1){
3846 $mbias_1{CHH}->{$index+1}->{un}++;
3847 }
3848 else{
3849 $mbias_2{CHH}->{$index+1}->{un}++;
3850 }
3851 }
3852 elsif ($methylation_calls[$index] eq '.') {}
3853 elsif (lc$methylation_calls[$index] eq 'u'){}
3854 else{
3855 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3856 }
3857 }
3858 } elsif ($strand eq '-') {
3859 for my $index (0..$#methylation_calls) {
3860
3861 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3863 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3864 $cigar_offset += $cigar_mod;
3865 $pos_offset += $pos_mod;
3866 }
3867
3868 ### Returning as soon as the methylation calls start overlapping
3869 if ($start-$index+$pos_offset <= $end_read_1) {
3870 return;
3871 }
3872
3873 if ($methylation_calls[$index] eq 'X') {
3874 $counting{total_meCHG_count}++;
3875 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3876 if ($read_identity == 1){
3877 $mbias_1{CHG}->{$index+1}->{meth}++;
3878 }
3879 else{
3880 $mbias_2{CHG}->{$index+1}->{meth}++;
3881 }
3882 }
3883 elsif ($methylation_calls[$index] eq 'x') {
3884 $counting{total_unmethylated_CHG_count}++;
3885 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3886 if ($read_identity == 1){
3887 $mbias_1{CHG}->{$index+1}->{un}++;
3888 }
3889 else{
3890 $mbias_2{CHG}->{$index+1}->{un}++;
3891 }
3892 }
3893 elsif ($methylation_calls[$index] eq 'Z') {
3894 $counting{total_meCpG_count}++;
3895 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3896 if ($read_identity == 1){
3897 $mbias_1{CpG}->{$index+1}->{meth}++;
3898 }
3899 else{
3900 $mbias_2{CpG}->{$index+1}->{meth}++;
3901 }
3902 }
3903 elsif ($methylation_calls[$index] eq 'z') {
3904 $counting{total_unmethylated_CpG_count}++;
3905 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3906 if ($read_identity == 1){
3907 $mbias_1{CpG}->{$index+1}->{un}++;
3908 }
3909 else{
3910 $mbias_2{CpG}->{$index+1}->{un}++;
3911 }
3912 }
3913 elsif ($methylation_calls[$index] eq 'H') {
3914 $counting{total_meCHH_count}++;
3915 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3916 if ($read_identity == 1){
3917 $mbias_1{CHH}->{$index+1}->{meth}++;
3918 }
3919 else{
3920 $mbias_2{CHH}->{$index+1}->{meth}++;
3921 }
3922 }
3923 elsif ($methylation_calls[$index] eq 'h') {
3924 $counting{total_unmethylated_CHH_count}++;
3925 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3926 if ($read_identity == 1){
3927 $mbias_1{CHH}->{$index+1}->{un}++;
3928 }
3929 else{
3930 $mbias_2{CHH}->{$index+1}->{un}++;
3931 }
3932 }
3933 elsif ($methylation_calls[$index] eq '.') {}
3934 elsif (lc$methylation_calls[$index] eq 'u'){}
3935 else{
3936 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3937 }
3938 }
3939 } else {
3940 die "The strand orientation as neither + nor -: '$strand'\n";
3941 }
3942 }
3943 }
3944
3945 ### this is the paired-end procedure allowing overlaps and using every single C position
3946 else {
3947 ### single-file CpG, CHG and CHH context output
3948 if ($full) {
3949 if ($strand eq '+') {
3950 for my $index (0..$#methylation_calls) {
3951
3952 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3953 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3954 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3955 $cigar_offset += $cigar_mod;
3956 $pos_offset += $pos_mod;
3957 }
3958
3959 if ($methylation_calls[$index] eq 'X') {
3960 $counting{total_meCHG_count}++;
3961 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3962 if ($read_identity == 1){
3963 $mbias_1{CHG}->{$index+1}->{meth}++;
3964 }
3965 else{
3966 $mbias_2{CHG}->{$index+1}->{meth}++;
3967 }
3968 }
3969 elsif ($methylation_calls[$index] eq 'x') {
3970 $counting{total_unmethylated_CHG_count}++;
3971 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3972 if ($read_identity == 1){
3973 $mbias_1{CHG}->{$index+1}->{un}++;
3974 }
3975 else{
3976 $mbias_2{CHG}->{$index+1}->{un}++;
3977 }
3978 }
3979 elsif ($methylation_calls[$index] eq 'Z') {
3980 $counting{total_meCpG_count}++;
3981 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3982 if ($read_identity == 1){
3983 $mbias_1{CpG}->{$index+1}->{meth}++;
3984 }
3985 else{
3986 $mbias_2{CpG}->{$index+1}->{meth}++;
3987 }
3988 }
3989 elsif ($methylation_calls[$index] eq 'z') {
3990 $counting{total_unmethylated_CpG_count}++;
3991 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3992 if ($read_identity == 1){
3993 $mbias_1{CpG}->{$index+1}->{un}++;
3994 }
3995 else{
3996 $mbias_2{CpG}->{$index+1}->{un}++;
3997 }
3998 }
3999 elsif ($methylation_calls[$index] eq 'H') {
4000 $counting{total_meCHH_count}++;
4001 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4002 if ($read_identity == 1){
4003 $mbias_1{CHH}->{$index+1}->{meth}++;
4004 }
4005 else{
4006 $mbias_2{CHH}->{$index+1}->{meth}++;
4007 }
4008 }
4009 elsif ($methylation_calls[$index] eq 'h') {
4010 $counting{total_unmethylated_CHH_count}++;
4011 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4012 if ($read_identity == 1){
4013 $mbias_1{CHH}->{$index+1}->{un}++;
4014 }
4015 else{
4016 $mbias_2{CHH}->{$index+1}->{un}++;
4017 }
4018 }
4019 elsif ($methylation_calls[$index] eq '.') {}
4020 elsif (lc$methylation_calls[$index] eq 'u'){}
4021 else{
4022 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4023 }
4024 }
4025 } elsif ($strand eq '-') {
4026 for my $index (0..$#methylation_calls) {
4027
4028 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4030 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4031 $cigar_offset += $cigar_mod;
4032 $pos_offset += $pos_mod;
4033 }
4034
4035 if ($methylation_calls[$index] eq 'X') {
4036 $counting{total_meCHG_count}++;
4037 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4038 if ($read_identity == 1){
4039 $mbias_1{CHG}->{$index+1}->{meth}++;
4040 }
4041 else{
4042 $mbias_2{CHG}->{$index+1}->{meth}++;
4043 }
4044 }
4045 elsif ($methylation_calls[$index] eq 'x') {
4046 $counting{total_unmethylated_CHG_count}++;
4047 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4048 if ($read_identity == 1){
4049 $mbias_1{CHG}->{$index+1}->{un}++;
4050 }
4051 else{
4052 $mbias_2{CHG}->{$index+1}->{un}++;
4053 }
4054 }
4055 elsif ($methylation_calls[$index] eq 'Z') {
4056 $counting{total_meCpG_count}++;
4057 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4058 if ($read_identity == 1){
4059 $mbias_1{CpG}->{$index+1}->{meth}++;
4060 }
4061 else{
4062 $mbias_2{CpG}->{$index+1}->{meth}++;
4063 }
4064 }
4065 elsif ($methylation_calls[$index] eq 'z') {
4066 $counting{total_unmethylated_CpG_count}++;
4067 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4068 if ($read_identity == 1){
4069 $mbias_1{CpG}->{$index+1}->{un}++;
4070 }
4071 else{
4072 $mbias_2{CpG}->{$index+1}->{un}++;
4073 }
4074 }
4075 elsif ($methylation_calls[$index] eq 'H') {
4076 $counting{total_meCHH_count}++;
4077 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4078 if ($read_identity == 1){
4079 $mbias_1{CHH}->{$index+1}->{meth}++;
4080 }
4081 else{
4082 $mbias_2{CHH}->{$index+1}->{meth}++;
4083 }
4084 }
4085 elsif ($methylation_calls[$index] eq 'h') {
4086 $counting{total_unmethylated_CHH_count}++;
4087 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4088 if ($read_identity == 1){
4089 $mbias_1{CHH}->{$index+1}->{un}++;
4090 }
4091 else{
4092 $mbias_2{CHH}->{$index+1}->{un}++;
4093 }
4094 }
4095 elsif ($methylation_calls[$index] eq '.') {}
4096 elsif (lc$methylation_calls[$index] eq 'u'){}
4097 else{
4098 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4099 }
4100 }
4101 } else {
4102 die "The strand orientation as neither + nor -: '$strand'\n";
4103 }
4104 }
4105
4106 ### strand-specific methylation output
4107 else {
4108 if ($strand eq '+') {
4109 for my $index (0..$#methylation_calls) {
4110
4111 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4112 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4113 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
4114 $cigar_offset += $cigar_mod;
4115 $pos_offset += $pos_mod;
4116 }
4117
4118 if ($methylation_calls[$index] eq 'X') {
4119 $counting{total_meCHG_count}++;
4120 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4121 if ($read_identity == 1){
4122 $mbias_1{CHG}->{$index+1}->{meth}++;
4123 }
4124 else{
4125 $mbias_2{CHG}->{$index+1}->{meth}++;
4126 }
4127 }
4128 elsif ($methylation_calls[$index] eq 'x') {
4129 $counting{total_unmethylated_CHG_count}++;
4130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4131 if ($read_identity == 1){
4132 $mbias_1{CHG}->{$index+1}->{un}++;
4133 }
4134 else{
4135 $mbias_2{CHG}->{$index+1}->{un}++;
4136 }
4137 }
4138 elsif ($methylation_calls[$index] eq 'Z') {
4139 $counting{total_meCpG_count}++;
4140 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4141 if ($read_identity == 1){
4142 $mbias_1{CpG}->{$index+1}->{meth}++;
4143 }
4144 else{
4145 $mbias_2{CpG}->{$index+1}->{meth}++;
4146 }
4147 }
4148 elsif ($methylation_calls[$index] eq 'z') {
4149 $counting{total_unmethylated_CpG_count}++;
4150 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4151 if ($read_identity == 1){
4152 $mbias_1{CpG}->{$index+1}->{un}++;
4153 }
4154 else{
4155 $mbias_2{CpG}->{$index+1}->{un}++;
4156 }
4157 }
4158 elsif ($methylation_calls[$index] eq 'H') {
4159 $counting{total_meCHH_count}++;
4160 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4161 if ($read_identity == 1){
4162 $mbias_1{CHH}->{$index+1}->{meth}++;
4163 }
4164 else{
4165 $mbias_2{CHH}->{$index+1}->{meth}++;
4166 }
4167 }
4168 elsif ($methylation_calls[$index] eq 'h') {
4169 $counting{total_unmethylated_CHH_count}++;
4170 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4171 if ($read_identity == 1){
4172 $mbias_1{CHH}->{$index+1}->{un}++;
4173 }
4174 else{
4175 $mbias_2{CHH}->{$index+1}->{un}++;
4176 }
4177 }
4178 elsif ($methylation_calls[$index] eq '.') {}
4179 elsif (lc$methylation_calls[$index] eq 'u'){}
4180 else{
4181 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4182 }
4183 }
4184 } elsif ($strand eq '-') {
4185 for my $index (0..$#methylation_calls) {
4186
4187 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4188 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4190 $cigar_offset += $cigar_mod;
4191 $pos_offset += $pos_mod;
4192 }
4193
4194 if ($methylation_calls[$index] eq 'X') {
4195 $counting{total_meCHG_count}++;
4196 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4197 if ($read_identity == 1){
4198 $mbias_1{CHG}->{$index+1}->{meth}++;
4199 }
4200 else{
4201 $mbias_2{CHG}->{$index+1}->{meth}++;
4202 }
4203 }
4204 elsif ($methylation_calls[$index] eq 'x') {
4205 $counting{total_unmethylated_CHG_count}++;
4206 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4207 if ($read_identity == 1){
4208 $mbias_1{CHG}->{$index+1}->{un}++;
4209 }
4210 else{
4211 $mbias_2{CHG}->{$index+1}->{un}++;
4212 }
4213 }
4214 elsif ($methylation_calls[$index] eq 'Z') {
4215 $counting{total_meCpG_count}++;
4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4217 if ($read_identity == 1){
4218 $mbias_1{CpG}->{$index+1}->{meth}++;
4219 }
4220 else{
4221 $mbias_2{CpG}->{$index+1}->{meth}++;
4222 }
4223 }
4224 elsif ($methylation_calls[$index] eq 'z') {
4225 $counting{total_unmethylated_CpG_count}++;
4226 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4227 if ($read_identity == 1){
4228 $mbias_1{CpG}->{$index+1}->{un}++;
4229 }
4230 else{
4231 $mbias_2{CpG}->{$index+1}->{un}++;
4232 }
4233 }
4234 elsif ($methylation_calls[$index] eq 'H') {
4235 $counting{total_meCHH_count}++;
4236 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4237 if ($read_identity == 1){
4238 $mbias_1{CHH}->{$index+1}->{meth}++;
4239 }
4240 else{
4241 $mbias_2{CHH}->{$index+1}->{meth}++;
4242 }
4243 }
4244 elsif ($methylation_calls[$index] eq 'h') {
4245 $counting{total_unmethylated_CHH_count}++;
4246 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4247 if ($read_identity == 1){
4248 $mbias_1{CHH}->{$index+1}->{un}++;
4249 }
4250 else{
4251 $mbias_2{CHH}->{$index+1}->{un}++;
4252 }
4253 }
4254 elsif ($methylation_calls[$index] eq '.') {}
4255 elsif (lc$methylation_calls[$index] eq 'u'){}
4256 else{
4257 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4258 }
4259 }
4260 } else {
4261 die "The strand orientation as neither + nor -: '$strand'\n";
4262 }
4263 }
4264 }
4265 }
4266
4267 sub check_cigar_string {
4268 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
4269 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
4270 my ($new_cigar_offset,$new_pos_offset) = (0,0);
4271
4272 if ($strand eq '+') {
4273 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
4274
4275 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4276 # warn "position needs no adjustment\n";
4277 }
4278
4279 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
4280 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
4281 # warn "adjusted genomic position by -1 bp (insertion)\n";
4282 }
4283
4284 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4285 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4286 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
4287 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
4288
4289 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
4290 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4291 # warn "position needs no adjustment\n";
4292 last;
4293 }
4294 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
4295 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
4296 # warn "adjusted genomic position by another -1 bp (insertion)\n";
4297 last;
4298 }
4299 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4300 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4301 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
4302 # warn "adjusted genomic position by another +1 bp (deletion)\n";
4303 }
4304 else{
4305 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4306 }
4307 }
4308 }
4309 else{
4310 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4311 }
4312 }
4313
4314 elsif ($strand eq '-') {
4315 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
4316
4317 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4318 # warn "position needs no adjustment\n";
4319 }
4320
4321 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
4322 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
4323 # warn "adjusted genomic position by +1 bp (insertion)\n";
4324 }
4325
4326 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4327 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4328 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
4329 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
4330
4331 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
4332 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4333 # warn "Found new 'M' operation; position needs no adjustment\n";
4334 last;
4335 }
4336 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
4337 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
4338 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
4339 last;
4340 }
4341 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4342 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4343 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
4344 # warn "adjusted genomic position by another -1 bp (deletion)\n";
4345 }
4346 else{
4347 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4348 }
4349 }
4350 }
4351 else{
4352 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4353 }
4354 }
4355 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
4356 return ($new_cigar_offset,$new_pos_offset);
4357 }
4358
4359 sub print_individual_C_methylation_states_single_end{
4360
4361 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
4362 my @methylation_calls = split(//,$meth_call);
4363
4364 #################################################################
4365 ### . for bases not involving cytosines ###
4366 ### X for methylated C in CHG context (was protected) ###
4367 ### x for not methylated C in CHG context (was converted) ###
4368 ### H for methylated C in CHH context (was protected) ###
4369 ### h for not methylated C in CHH context (was converted) ###
4370 ### Z for methylated C in CpG context (was protected) ###
4371 ### z for not methylated C in CpG context (was converted) ###
4372 #################################################################
4373
4374 my $methyl_CHG_count = 0;
4375 my $methyl_CHH_count = 0;
4376 my $methyl_CpG_count = 0;
4377 my $unmethylated_CHG_count = 0;
4378 my $unmethylated_CHH_count = 0;
4379 my $unmethylated_CpG_count = 0;
4380
4381 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
4382 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
4383
4384 my @comp_cigar;
4385
4386 if ($cigar){ # parsing CIGAR string
4387
4388 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing
4389 if ($cigar =~ /^\d+M$/){
4390 # warn "See!? I told you so! $cigar\n";
4391 # sleep(1);
4392 }
4393 else{
4394
4395 my @len;
4396 my @ops;
4397
4398 @len = split (/\D+/,$cigar); # storing the length per operation
4399 @ops = split (/\d+/,$cigar); # storing the operation
4400 shift @ops; # remove the empty first element
4401 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
4402 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
4403
4404 foreach my $index (0..$#len){
4405 foreach (1..$len[$index]){
4406 # print "$ops[$index]";
4407 push @comp_cigar, $ops[$index];
4408 }
4409 }
4410 }
4411 # warn "\nDetected CIGAR string: $cigar\n";
4412 # warn "Length of methylation call: ",length $meth_call,"\n";
4413 # warn "number of operations: ",scalar @ops,"\n";
4414 # warn "number of length digits: ",scalar @len,"\n\n";
4415 # print @comp_cigar,"\n";
4416 # print "$meth_call\n\n";
4417 # sleep (1);
4418 }
4419
4420 ### adjusting the start position for all reads mapping to the reverse strand
4421 if ($strand eq '-') {
4422
4423 if (@comp_cigar){ # only needed for SAM reads with InDels
4424 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
4425 # print @comp_cigar,"\n";
4426 }
4427
4428 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
4429
4430 if ($cigar){ ### SAM format
4431 if ($cigar =~ /^(\d+)M$/){ # linear match
4432 $start += $1 - 1;
4433 }
4434 else{ # InDel read
4435 my $MD_count = 0;
4436 foreach (@comp_cigar){
4437 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
4438 }
4439 $start += $MD_count - 1;
4440 }
4441 }
4442 else{ ### vanilla format
4443 $start += length($meth_call)-1;
4444 }
4445 }
4446 }
4447
4448 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
4449
4450 ### single-file CpG and other-context output
4451 if ($full and $merge_non_CpG) {
4452 if ($strand eq '+') {
4453 for my $index (0..$#methylation_calls) {
4454
4455 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
4456 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4457 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
4458 $cigar_offset += $cigar_mod;
4459 $pos_offset += $pos_mod;
4460 }
4461
4462 ### methylated Cs (any context) will receive a forward (+) orientation
4463 ### not methylated Cs (any context) will receive a reverse (-) orientation
4464 if ($methylation_calls[$index] eq 'X') {
4465 $counting{total_meCHG_count}++;
4466 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4467 print {$fhs{other_context}} $line unless($mbias_only);
4468 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4469 $mbias_1{CHG}->{$index+1}->{meth}++;
4470 }
4471 elsif ($methylation_calls[$index] eq 'x') {
4472 $counting{total_unmethylated_CHG_count}++;
4473 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4474 print {$fhs{other_context}} $line unless($mbias_only);
4475 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4476 $mbias_1{CHG}->{$index+1}->{un}++;
4477 }
4478 elsif ($methylation_calls[$index] eq 'Z') {
4479 $counting{total_meCpG_count}++;
4480 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4481 print {$fhs{CpG_context}} $line unless($mbias_only);
4482 # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4483 $mbias_1{CpG}->{$index+1}->{meth}++;
4484 }
4485 elsif ($methylation_calls[$index] eq 'z') {
4486 $counting{total_unmethylated_CpG_count}++;
4487 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4488 print {$fhs{CpG_context}} $line unless($mbias_only);
4489 # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4490 $mbias_1{CpG}->{$index+1}->{un}++;
4491 }
4492 elsif ($methylation_calls[$index] eq 'H') {
4493 $counting{total_meCHH_count}++;
4494 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4495 print {$fhs{other_context}} $line unless($mbias_only);
4496 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4497 $mbias_1{CHH}->{$index+1}->{meth}++;
4498 }
4499 elsif ($methylation_calls[$index] eq 'h') {
4500 $counting{total_unmethylated_CHH_count}++;
4501 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4502 print {$fhs{other_context}} $line unless($mbias_only);
4503 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4504 $mbias_1{CHH}->{$index+1}->{un}++;
4505 }
4506 elsif ($methylation_calls[$index] eq '.') {}
4507 elsif (lc$methylation_calls[$index] eq 'u'){}
4508 else{
4509 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4510 }
4511 }
4512 }
4513 elsif ($strand eq '-') {
4514
4515 for my $index (0..$#methylation_calls) {
4516 ### methylated Cs (any context) will receive a forward (+) orientation
4517 ### not methylated Cs (any context) will receive a reverse (-) orientation
4518
4519 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
4520 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4521 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4522 $cigar_offset += $cigar_mod;
4523 $pos_offset += $pos_mod;
4524 }
4525
4526 if ($methylation_calls[$index] eq 'X') {
4527 $counting{total_meCHG_count}++;
4528 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4529 print {$fhs{other_context}} $line unless($mbias_only);
4530 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4531 $mbias_1{CHG}->{$index+1}->{meth}++;
4532 }
4533 elsif ($methylation_calls[$index] eq 'x') {
4534 $counting{total_unmethylated_CHG_count}++;
4535 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4536 print {$fhs{other_context}} $line unless($mbias_only);
4537 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4538 $mbias_1{CHG}->{$index+1}->{un}++;
4539 }
4540 elsif ($methylation_calls[$index] eq 'Z') {
4541 $counting{total_meCpG_count}++;
4542 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4543 print {$fhs{CpG_context}} $line unless($mbias_only);
4544 #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4545 $mbias_1{CpG}->{$index+1}->{meth}++;
4546 }
4547 elsif ($methylation_calls[$index] eq 'z') {
4548 $counting{total_unmethylated_CpG_count}++;
4549 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4550 print {$fhs{CpG_context}} $line unless($mbias_only);
4551 #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4552 $mbias_1{CpG}->{$index+1}->{un}++;
4553 }
4554 elsif ($methylation_calls[$index] eq 'H') {
4555 $counting{total_meCHH_count}++;
4556 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4557 print {$fhs{other_context}} $line unless($mbias_only);
4558 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4559 $mbias_1{CHH}->{$index+1}->{meth}++;
4560 }
4561 elsif ($methylation_calls[$index] eq 'h') {
4562 $counting{total_unmethylated_CHH_count}++;
4563 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4564 print {$fhs{other_context}} $line unless($mbias_only);
4565 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4566 $mbias_1{CHH}->{$index+1}->{un}++;
4567 }
4568 elsif ($methylation_calls[$index] eq '.'){}
4569 elsif (lc$methylation_calls[$index] eq 'u'){}
4570 else{
4571 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4572 }
4573 }
4574 }
4575 else {
4576 die "The strand information was neither + nor -: $strand\n";
4577 }
4578 }
4579
4580 ### strand-specific methylation output
4581 elsif ($merge_non_CpG) {
4582 if ($strand eq '+') {
4583 for my $index (0..$#methylation_calls) {
4584 ### methylated Cs (any context) will receive a forward (+) orientation
4585 ### not methylated Cs (any context) will receive a reverse (-) orientation
4586
4587 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4588 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4589 $cigar_offset += $cigar_mod;
4590 $pos_offset += $pos_mod;
4591 }
4592
4593 if ($methylation_calls[$index] eq 'X') {
4594 $counting{total_meCHG_count}++;
4595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4596 $mbias_1{CHG}->{$index+1}->{meth}++;
4597 }
4598 elsif ($methylation_calls[$index] eq 'x') {
4599 $counting{total_unmethylated_CHG_count}++;
4600 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4601 $mbias_1{CHG}->{$index+1}->{un}++;
4602 }
4603 elsif ($methylation_calls[$index] eq 'Z') {
4604 $counting{total_meCpG_count}++;
4605 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4606 $mbias_1{CpG}->{$index+1}->{meth}++;
4607 }
4608 elsif ($methylation_calls[$index] eq 'z') {
4609 $counting{total_unmethylated_CpG_count}++;
4610 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4611 $mbias_1{CpG}->{$index+1}->{un}++;
4612 }
4613 elsif ($methylation_calls[$index] eq 'H') {
4614 $counting{total_meCHH_count}++;
4615 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4616 $mbias_1{CHH}->{$index+1}->{meth}++;
4617 }
4618 elsif ($methylation_calls[$index] eq 'h') {
4619 $counting{total_unmethylated_CHH_count}++;
4620 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4621 $mbias_1{CHH}->{$index+1}->{un}++;
4622 }
4623 elsif ($methylation_calls[$index] eq '.') {}
4624 elsif (lc$methylation_calls[$index] eq 'u'){}
4625 else{
4626 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4627 }
4628 }
4629 }
4630 elsif ($strand eq '-') {
4631
4632 for my $index (0..$#methylation_calls) {
4633 ### methylated Cs (any context) will receive a forward (+) orientation
4634 ### not methylated Cs (any context) will receive a reverse (-) orientation
4635
4636 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4637 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4638 $cigar_offset += $cigar_mod;
4639 $pos_offset += $pos_mod;
4640 }
4641
4642 if ($methylation_calls[$index] eq 'X') {
4643 $counting{total_meCHG_count}++;
4644 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4645 $mbias_1{CHG}->{$index+1}->{meth}++;
4646 }
4647 elsif ($methylation_calls[$index] eq 'x') {
4648 $counting{total_unmethylated_CHG_count}++;
4649 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4650 $mbias_1{CHG}->{$index+1}->{un}++;
4651 }
4652 elsif ($methylation_calls[$index] eq 'Z') {
4653 $counting{total_meCpG_count}++;
4654 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4655 $mbias_1{CpG}->{$index+1}->{meth}++;
4656 }
4657 elsif ($methylation_calls[$index] eq 'z') {
4658 $counting{total_unmethylated_CpG_count}++;
4659 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4660 $mbias_1{CpG}->{$index+1}->{un}++;
4661 }
4662 elsif ($methylation_calls[$index] eq 'H') {
4663 $counting{total_meCHH_count}++;
4664 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4665 $mbias_1{CHH}->{$index+1}->{meth}++;
4666 }
4667 elsif ($methylation_calls[$index] eq 'h') {
4668 $counting{total_unmethylated_CHH_count}++;
4669 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4670 $mbias_1{CHH}->{$index+1}->{un}++;
4671 }
4672 elsif ($methylation_calls[$index] eq '.') {}
4673 elsif (lc$methylation_calls[$index] eq 'u'){}
4674 else{
4675 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4676 }
4677 }
4678 }
4679 else {
4680 die "The strand information was neither + nor -: $strand\n";
4681 }
4682 }
4683
4684 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
4685
4686 elsif ($full) {
4687 if ($strand eq '+') {
4688 for my $index (0..$#methylation_calls) {
4689 ### methylated Cs (any context) will receive a forward (+) orientation
4690 ### not methylated Cs (any context) will receive a reverse (-) orientation
4691
4692 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4693 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4694 $cigar_offset += $cigar_mod;
4695 $pos_offset += $pos_mod;
4696 }
4697
4698 if ($methylation_calls[$index] eq 'X') {
4699 $counting{total_meCHG_count}++;
4700 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4701 $mbias_1{CHG}->{$index+1}->{meth}++;
4702 }
4703 elsif ($methylation_calls[$index] eq 'x') {
4704 $counting{total_unmethylated_CHG_count}++;
4705 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4706 $mbias_1{CHG}->{$index+1}->{un}++;
4707 }
4708 elsif ($methylation_calls[$index] eq 'Z') {
4709 $counting{total_meCpG_count}++;
4710 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4711 $mbias_1{CpG}->{$index+1}->{meth}++;
4712 }
4713 elsif ($methylation_calls[$index] eq 'z') {
4714 $counting{total_unmethylated_CpG_count}++;
4715 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4716 $mbias_1{CpG}->{$index+1}->{un}++;
4717 }
4718 elsif ($methylation_calls[$index] eq 'H') {
4719 $counting{total_meCHH_count}++;
4720 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4721 $mbias_1{CHH}->{$index+1}->{meth}++;
4722 }
4723 elsif ($methylation_calls[$index] eq 'h') {
4724 $counting{total_unmethylated_CHH_count}++;
4725 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4726 $mbias_1{CHH}->{$index+1}->{un}++;
4727 }
4728 elsif ($methylation_calls[$index] eq '.') {}
4729 elsif (lc$methylation_calls[$index] eq 'u'){}
4730 else{
4731 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
4732 }
4733 }
4734 }
4735 elsif ($strand eq '-') {
4736
4737 for my $index (0..$#methylation_calls) {
4738 ### methylated Cs (any context) will receive a forward (+) orientation
4739 ### not methylated Cs (any context) will receive a reverse (-) orientation
4740
4741 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4743 $cigar_offset += $cigar_mod;
4744 $pos_offset += $pos_mod;
4745 }
4746
4747 if ($methylation_calls[$index] eq 'X') {
4748 $counting{total_meCHG_count}++;
4749 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4750 $mbias_1{CHG}->{$index+1}->{meth}++;
4751 }
4752 elsif ($methylation_calls[$index] eq 'x') {
4753 $counting{total_unmethylated_CHG_count}++;
4754 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4755 $mbias_1{CHG}->{$index+1}->{un}++;
4756 }
4757 elsif ($methylation_calls[$index] eq 'Z') {
4758 $counting{total_meCpG_count}++;
4759 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4760 $mbias_1{CpG}->{$index+1}->{meth}++;
4761 }
4762 elsif ($methylation_calls[$index] eq 'z') {
4763 $counting{total_unmethylated_CpG_count}++;
4764 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4765 $mbias_1{CpG}->{$index+1}->{un}++;
4766 }
4767 elsif ($methylation_calls[$index] eq 'H') {
4768 $counting{total_meCHH_count}++;
4769 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4770 $mbias_1{CHH}->{$index+1}->{meth}++;
4771 }
4772 elsif ($methylation_calls[$index] eq 'h') {
4773 $counting{total_unmethylated_CHH_count}++;
4774 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4775 $mbias_1{CHH}->{$index+1}->{un}++;
4776 }
4777 elsif ($methylation_calls[$index] eq '.') {}
4778 elsif (lc$methylation_calls[$index] eq 'u'){}
4779 else{
4780 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4781 }
4782 }
4783 }
4784 else {
4785 die "The read had a strand orientation which was neither + nor -: $strand\n";
4786 }
4787 }
4788
4789 ### strand-specific methylation output
4790 else {
4791 if ($strand eq '+') {
4792 for my $index (0..$#methylation_calls) {
4793 ### methylated Cs (any context) will receive a forward (+) orientation
4794 ### not methylated Cs (any context) will receive a reverse (-) orientation
4795
4796 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4797 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4798 $cigar_offset += $cigar_mod;
4799 $pos_offset += $pos_mod;
4800 }
4801
4802 if ($methylation_calls[$index] eq 'X') {
4803 $counting{total_meCHG_count}++;
4804 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4805 $mbias_1{CHG}->{$index+1}->{meth}++;
4806 }
4807 elsif ($methylation_calls[$index] eq 'x') {
4808 $counting{total_unmethylated_CHG_count}++;
4809 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4810 $mbias_1{CHG}->{$index+1}->{un}++;
4811 }
4812 elsif ($methylation_calls[$index] eq 'Z') {
4813 $counting{total_meCpG_count}++;
4814 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4815 $mbias_1{CpG}->{$index+1}->{meth}++;
4816 }
4817 elsif ($methylation_calls[$index] eq 'z') {
4818 $counting{total_unmethylated_CpG_count}++;
4819 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4820 $mbias_1{CpG}->{$index+1}->{un}++;
4821 }
4822 elsif ($methylation_calls[$index] eq 'H') {
4823 $counting{total_meCHH_count}++;
4824 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4825 $mbias_1{CHH}->{$index+1}->{meth}++;
4826 }
4827 elsif ($methylation_calls[$index] eq 'h') {
4828 $counting{total_unmethylated_CHH_count}++;
4829 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4830 $mbias_1{CHH}->{$index+1}->{un}++;
4831 }
4832 elsif ($methylation_calls[$index] eq '.') {}
4833 elsif (lc$methylation_calls[$index] eq 'u'){}
4834 else{
4835 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4836 }
4837 }
4838 }
4839 elsif ($strand eq '-') {
4840
4841 for my $index (0..$#methylation_calls) {
4842 ### methylated Cs (any context) will receive a forward (+) orientation
4843 ### not methylated Cs (any context) will receive a reverse (-) orientation
4844
4845 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4846 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4847 $cigar_offset += $cigar_mod;
4848 $pos_offset += $pos_mod;
4849 }
4850
4851 if ($methylation_calls[$index] eq 'X') {
4852 $counting{total_meCHG_count}++;
4853 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4854 $mbias_1{CHG}->{$index+1}->{meth}++;
4855 }
4856 elsif ($methylation_calls[$index] eq 'x') {
4857 $counting{total_unmethylated_CHG_count}++;
4858 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4859 $mbias_1{CHG}->{$index+1}->{un}++;
4860 }
4861 elsif ($methylation_calls[$index] eq 'Z') {
4862 $counting{total_meCpG_count}++;
4863 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4864 $mbias_1{CpG}->{$index+1}->{meth}++;
4865 }
4866 elsif ($methylation_calls[$index] eq 'z') {
4867 $counting{total_unmethylated_CpG_count}++;
4868 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4869 $mbias_1{CpG}->{$index+1}->{un}++;
4870 }
4871 elsif ($methylation_calls[$index] eq 'H') {
4872 $counting{total_meCHH_count}++;
4873 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4874 $mbias_1{CHH}->{$index+1}->{meth}++;
4875 }
4876 elsif ($methylation_calls[$index] eq 'h') {
4877 $counting{total_unmethylated_CHH_count}++;
4878 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4879 $mbias_1{CHH}->{$index+1}->{un}++;
4880 }
4881 elsif ($methylation_calls[$index] eq '.') {}
4882 elsif (lc$methylation_calls[$index] eq 'u'){}
4883 else{
4884 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4885 }
4886 }
4887 }
4888 else {
4889 die "The strand information was neither + nor -: $strand\n";
4890 }
4891 }
4892 }
4893
4894 sub open_output_filehandles{
4895
4896 my $filename = shift;
4897
4898 my $output_filename = (split (/\//,$filename))[-1];
4899 my $report_filename = $output_filename;
4900
4901 ### OPENING OUTPUT-FILEHANDLES
4902 if ($report) {
4903 $report_filename =~ s/\.sam$//;
4904 $report_filename =~ s/\.bam$//;
4905 $report_filename =~ s/\.cram$//;
4906 $report_filename =~ s/\.txt$//;
4907 $report_filename =~ s/$/_splitting_report.txt/;
4908 $report_filename = $output_dir . $report_filename;
4909 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
4910 }
4911
4912 if ($report) {
4913
4914 print REPORT "$output_filename\n\n";
4915 print REPORT "Parameters used to extract methylation information:\n";
4916 print REPORT "Bismark Extractor Version: $version\n";
4917
4918 if ($paired) {
4919 if ($vanilla) {
4920 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
4921 } else {
4922 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
4923 }
4924 }
4925
4926 if ($single) {
4927 if ($vanilla) {
4928 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
4929 } else {
4930 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
4931 }
4932 }
4933 if ($single){
4934 if ($ignore) {
4935 print REPORT "Ignoring first $ignore bp\n";
4936 }
4937 if ($ignore_3prime) {
4938 print REPORT "Ignoring last $ignore_3prime bp\n";
4939 }
4940 }
4941 else{ # paired-end
4942 if ($ignore) {
4943 print REPORT "Ignoring first $ignore bp of Read 1\n";
4944 }
4945 if ($ignore_r2){
4946 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
4947 }
4948
4949 if ($ignore_3prime) {
4950 print REPORT "Ignoring last $ignore_3prime bp of Read 1\n";
4951 }
4952 if ($ignore_3prime_r2){
4953 print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n";
4954 }
4955
4956 }
4957
4958 if ($full) {
4959 print REPORT "Output specified: comprehensive\n";
4960 } else {
4961 print REPORT "Output specified: strand-specific (default)\n";
4962 }
4963
4964 if ($no_overlap) {
4965 print REPORT "No overlapping methylation calls specified\n";
4966 }
4967 if ($genomic_fasta) {
4968 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
4969 }
4970 if ($merge_non_CpG) {
4971 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
4972 }
4973
4974 print REPORT "\n";
4975 }
4976
4977 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
4978
4979 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
4980 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
4981 if ($full and $merge_non_CpG) {
4982 my $cpg_output = my $other_c_output = $output_filename;
4983 ### C in CpG context
4984 $cpg_output =~ s/^/CpG_context_/;
4985 $cpg_output =~ s/sam$/txt/;
4986 $cpg_output =~ s/bam$/txt/;
4987 $cpg_output =~ s/cram$/txt/;
4988 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
4989 $cpg_output = $output_dir . $cpg_output;
4990
4991 if ($gzip){
4992 $cpg_output .= '.gz';
4993 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4994 }
4995 else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option...
4996 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4997 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4998 }
4999
5000 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
5001 push @sorting_files,$cpg_output;
5002
5003 unless ($no_header) {
5004 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5005 }
5006
5007 ### C in any other context than CpG
5008 $other_c_output =~ s/^/Non_CpG_context_/;
5009 $other_c_output =~ s/sam$/txt/;
5010 $other_c_output =~ s/bam$/txt/;
5011 $other_c_output =~ s/cram$/txt/;
5012 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
5013 $other_c_output = $output_dir . $other_c_output;
5014
5015 if ($gzip){
5016 $other_c_output .= '.gz';
5017 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
5018 }
5019 else{
5020 open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
5021 # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
5022 }
5023
5024 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
5025 push @sorting_files,$other_c_output;
5026
5027
5028 unless ($no_header) {
5029 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5030 }
5031 }
5032
5033 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
5034 elsif ($merge_non_CpG) {
5035
5036 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
5037
5038 ### For cytosines in CpG context
5039 $cpg_ot =~ s/^/CpG_OT_/;
5040 $cpg_ot =~ s/sam$/txt/;
5041 $cpg_ot =~ s/bam$/txt/;
5042 $cpg_ot =~ s/cram$/txt/;
5043 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
5044 $cpg_ot = $output_dir . $cpg_ot;
5045
5046 if ($gzip){
5047 $cpg_ot .= '.gz';
5048 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5049 }
5050 else{
5051 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5052 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5053 }
5054
5055 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
5056 push @sorting_files,$cpg_ot;
5057
5058 unless($no_header){
5059 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5060 }
5061
5062 $cpg_ctot =~ s/^/CpG_CTOT_/;
5063 $cpg_ctot =~ s/sam$/txt/;
5064 $cpg_ctot =~ s/bam$/txt/;
5065 $cpg_ctot =~ s/cram$/txt/;
5066 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
5067 $cpg_ctot = $output_dir . $cpg_ctot;
5068
5069 if ($gzip){
5070 $cpg_ctot .= '.gz';
5071 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5072 }
5073 else{
5074 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5075 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5076 }
5077
5078 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
5079 push @sorting_files,$cpg_ctot;
5080
5081 unless($no_header){
5082 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5083 }
5084
5085 $cpg_ctob =~ s/^/CpG_CTOB_/;
5086 $cpg_ctob =~ s/sam$/txt/;
5087 $cpg_ctob =~ s/bam$/txt/;
5088 $cpg_ctob =~ s/cram$/txt/;
5089 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
5090 $cpg_ctob = $output_dir . $cpg_ctob;
5091
5092 if ($gzip){
5093 $cpg_ctob .= '.gz';
5094 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5095 }
5096 else{
5097 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5098 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5099 }
5100
5101 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
5102 push @sorting_files,$cpg_ctob;
5103
5104 unless($no_header){
5105 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5106 }
5107
5108 $cpg_ob =~ s/^/CpG_OB_/;
5109 $cpg_ob =~ s/sam$/txt/;
5110 $cpg_ob =~ s/bam$/txt/;
5111 $cpg_ob =~ s/cram$/txt/;
5112 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
5113 $cpg_ob = $output_dir . $cpg_ob;
5114
5115 if ($gzip){
5116 $cpg_ob .= '.gz';
5117 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5118 }
5119 else{
5120 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5121 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5122 }
5123
5124 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
5125 push @sorting_files,$cpg_ob;
5126
5127 unless($no_header){
5128 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5129 }
5130
5131 ### For cytosines in Non-CpG (CC, CT or CA) context
5132 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
5133
5134 $other_c_ot =~ s/^/Non_CpG_OT_/;
5135 $other_c_ot =~ s/sam$/txt/;
5136 $other_c_ot =~ s/bam$/txt/;
5137 $other_c_ot =~ s/cram$/txt/;
5138 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
5139 $other_c_ot = $output_dir . $other_c_ot;
5140
5141 if ($gzip){
5142 $other_c_ot .= '.gz';
5143 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5144 }
5145 else{
5146 open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5147 # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5148 }
5149
5150 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
5151 push @sorting_files,$other_c_ot;
5152
5153 unless($no_header){
5154 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5155 }
5156
5157 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
5158 $other_c_ctot =~ s/sam$/txt/;
5159 $other_c_ctot =~ s/bam$/txt/;
5160 $other_c_ctot =~ s/cram$/txt/;
5161 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
5162 $other_c_ctot = $output_dir . $other_c_ctot;
5163
5164 if ($gzip){
5165 $other_c_ctot .= '.gz';
5166 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5167 }
5168 else{
5169 open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5170 # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5171 }
5172
5173 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
5174 push @sorting_files,$other_c_ctot;
5175
5176 unless($no_header){
5177 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5178 }
5179
5180 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
5181 $other_c_ctob =~ s/sam$/txt/;
5182 $other_c_ctob =~ s/bam$/txt/;
5183 $other_c_ctob =~ s/cram$/txt/;
5184 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
5185 $other_c_ctob = $output_dir . $other_c_ctob;
5186
5187 if ($gzip){
5188 $other_c_ctob .= '.gz';
5189 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5190 }
5191 else{
5192 open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5193 # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5194 }
5195
5196 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
5197 push @sorting_files,$other_c_ctob;
5198
5199 unless($no_header){
5200 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5201 }
5202
5203 $other_c_ob =~ s/^/Non_CpG_OB_/;
5204 $other_c_ob =~ s/sam$/txt/;
5205 $other_c_ob =~ s/sam$/txt/;
5206 $other_c_ob =~ s/cram$/txt/;
5207 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
5208 $other_c_ob = $output_dir . $other_c_ob;
5209
5210 if ($gzip){
5211 $other_c_ob .= '.gz';
5212 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5213 }
5214 else{
5215 open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5216 # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5217 }
5218
5219 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
5220 push @sorting_files,$other_c_ob;
5221
5222 unless($no_header){
5223 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5224 }
5225 }
5226
5227 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
5228
5229 ### if --comprehensive was specified we are only writing one file per context
5230 elsif ($full) {
5231 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
5232 ### C in CpG context
5233 $cpg_output =~ s/^/CpG_context_/;
5234 $cpg_output =~ s/sam$/txt/;
5235 $cpg_output =~ s/bam$/txt/;
5236 $cpg_output =~ s/cram$/txt/;
5237 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
5238 $cpg_output = $output_dir . $cpg_output;
5239
5240 if ($gzip){
5241 $cpg_output .= '.gz';
5242 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5243 }
5244 else{
5245 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5246 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5247 }
5248
5249 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
5250 push @sorting_files,$cpg_output;
5251
5252 unless($no_header){
5253 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5254 }
5255
5256 ### C in CHG context
5257 $chg_output =~ s/^/CHG_context_/;
5258 $chg_output =~ s/sam$/txt/;
5259 $chg_output =~ s/bam$/txt/;
5260 $chg_output =~ s/cram$/txt/;
5261 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
5262 $chg_output = $output_dir . $chg_output;
5263
5264 if ($gzip){
5265 $chg_output .= '.gz';
5266 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5267 }
5268 else{
5269 open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5270 # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5271 }
5272
5273 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
5274 push @sorting_files,$chg_output;
5275
5276 unless($no_header){
5277 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5278 }
5279
5280 ### C in CHH context
5281 $chh_output =~ s/^/CHH_context_/;
5282 $chh_output =~ s/sam$/txt/;
5283 $chh_output =~ s/bam$/txt/;
5284 $chh_output =~ s/cram$/txt/;
5285 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
5286 $chh_output = $output_dir . $chh_output;
5287
5288 if ($gzip){
5289 $chh_output .= '.gz';
5290 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5291 }
5292 else{
5293 open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5294 # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5295 }
5296
5297 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
5298 push @sorting_files, $chh_output;
5299
5300 unless($no_header){
5301 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5302 }
5303 }
5304 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
5305 else {
5306 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
5307
5308 ### For cytosines in CpG context
5309 $cpg_ot =~ s/^/CpG_OT_/;
5310 $cpg_ot =~ s/sam$/txt/;
5311 $cpg_ot =~ s/bam$/txt/;
5312 $cpg_ot =~ s/cram$/txt/;
5313 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
5314 $cpg_ot = $output_dir . $cpg_ot;
5315
5316 if ($gzip){
5317 $cpg_ot .= '.gz';
5318 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5319 }
5320 else{
5321 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5322 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5323 }
5324
5325 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
5326 push @sorting_files,$cpg_ot;
5327
5328 unless($no_header){
5329 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5330 }
5331
5332 $cpg_ctot =~ s/^/CpG_CTOT_/;
5333 $cpg_ctot =~ s/sam$/txt/;
5334 $cpg_ctot =~ s/bam$/txt/;
5335 $cpg_ctot =~ s/cram$/txt/;
5336 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
5337 $cpg_ctot = $output_dir . $cpg_ctot;
5338
5339 if ($gzip){
5340 $cpg_ctot .= '.gz';
5341 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5342 }
5343 else{
5344 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5345 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5346 }
5347
5348 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
5349 push @sorting_files,$cpg_ctot;
5350
5351 unless($no_header){
5352 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5353 }
5354
5355 $cpg_ctob =~ s/^/CpG_CTOB_/;
5356 $cpg_ctob =~ s/sam$/txt/;
5357 $cpg_ctob =~ s/bam$/txt/;
5358 $cpg_ctob =~ s/cram$/txt/;
5359 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
5360 $cpg_ctob = $output_dir . $cpg_ctob;
5361
5362 if ($gzip){
5363 $cpg_ctob .= '.gz';
5364 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5365 }
5366 else{
5367 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5368 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5369 }
5370
5371 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
5372 push @sorting_files,$cpg_ctob;
5373
5374 unless($no_header){
5375 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5376 }
5377
5378 $cpg_ob =~ s/^/CpG_OB_/;
5379 $cpg_ob =~ s/sam$/txt/;
5380 $cpg_ob =~ s/bam$/txt/;
5381 $cpg_ob =~ s/cram$/txt/;
5382 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
5383 $cpg_ob = $output_dir . $cpg_ob;
5384
5385 if ($gzip){
5386 $cpg_ob .= '.gz';
5387 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5388 }
5389 else{
5390 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5391 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5392 }
5393
5394 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
5395 push @sorting_files,$cpg_ob;
5396
5397 unless($no_header){
5398 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5399 }
5400
5401 ### For cytosines in CHG context
5402 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
5403
5404 $chg_ot =~ s/^/CHG_OT_/;
5405 $chg_ot =~ s/sam$/txt/;
5406 $chg_ot =~ s/bam$/txt/;
5407 $chg_ot =~ s/cram$/txt/;
5408 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
5409 $chg_ot = $output_dir . $chg_ot;
5410
5411 if ($gzip){
5412 $chg_ot .= '.gz';
5413 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5414 }
5415 else{
5416 open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5417 # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5418 }
5419
5420 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
5421 push @sorting_files,$chg_ot;
5422
5423 unless($no_header){
5424 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5425 }
5426
5427 $chg_ctot =~ s/^/CHG_CTOT_/;
5428 $chg_ctot =~ s/sam$/txt/;
5429 $chg_ctot =~ s/bam$/txt/;
5430 $chg_ctot =~ s/cram$/txt/;
5431 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
5432 $chg_ctot = $output_dir . $chg_ctot;
5433
5434 if ($gzip){
5435 $chg_ctot .= '.gz';
5436 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5437 }
5438 else{
5439 open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5440 # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5441 }
5442
5443 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
5444 push @sorting_files,$chg_ctot;
5445
5446 unless($no_header){
5447 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5448 }
5449
5450 $chg_ctob =~ s/^/CHG_CTOB_/;
5451 $chg_ctob =~ s/sam$/txt/;
5452 $chg_ctob =~ s/bam$/txt/;
5453 $chg_ctob =~ s/cram$/txt/;
5454 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
5455 $chg_ctob = $output_dir . $chg_ctob;
5456
5457 if ($gzip){
5458 $chg_ctob .= '.gz';
5459 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5460 }
5461 else{
5462 open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5463 # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5464 }
5465
5466 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
5467 push @sorting_files,$chg_ctob;
5468
5469 unless($no_header){
5470 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5471 }
5472
5473 $chg_ob =~ s/^/CHG_OB_/;
5474 $chg_ob =~ s/sam$/txt/;
5475 $chg_ob =~ s/bam$/txt/;
5476 $chg_ob =~ s/cram$/txt/;
5477 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
5478 $chg_ob = $output_dir . $chg_ob;
5479
5480 if ($gzip){
5481 $chg_ob .= '.gz';
5482 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5483 }
5484 else{
5485 open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5486 # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5487 }
5488
5489 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
5490 push @sorting_files,$chg_ob;
5491
5492 unless($no_header){
5493 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5494 }
5495
5496 ### For cytosines in CHH context
5497 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
5498
5499 $chh_ot =~ s/^/CHH_OT_/;
5500 $chh_ot =~ s/sam$/txt/;
5501 $chh_ot =~ s/bam$/txt/;
5502 $chh_ot =~ s/cram$/txt/;
5503 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
5504 $chh_ot = $output_dir . $chh_ot;
5505
5506 if ($gzip){
5507 $chh_ot .= '.gz';
5508 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5509 }
5510 else{
5511 open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5512 # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5513 }
5514
5515 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
5516 push @sorting_files,$chh_ot;
5517
5518 unless($no_header){
5519 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5520 }
5521
5522 $chh_ctot =~ s/^/CHH_CTOT_/;
5523 $chh_ctot =~ s/sam$/txt/;
5524 $chh_ctot =~ s/bam$/txt/;
5525 $chh_ctot =~ s/cram$/txt/;
5526 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
5527 $chh_ctot = $output_dir . $chh_ctot;
5528
5529 if ($gzip){
5530 $chh_ctot .= '.gz';
5531 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5532 }
5533 else{
5534 open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5535 # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5536 }
5537
5538 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
5539 push @sorting_files,$chh_ctot;
5540
5541 unless($no_header){
5542 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5543 }
5544
5545 $chh_ctob =~ s/^/CHH_CTOB_/;
5546 $chh_ctob =~ s/sam$/txt/;
5547 $chh_ctob =~ s/bam$/txt/;
5548 $chh_ctob =~ s/cram$/txt/;
5549 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
5550 $chh_ctob = $output_dir . $chh_ctob;
5551
5552 if ($gzip){
5553 $chh_ctob .= '.gz';
5554 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5555 }
5556 else{
5557 open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5558 # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5559 }
5560
5561 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
5562 push @sorting_files,$chh_ctob;
5563
5564 unless($no_header){
5565 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5566 }
5567
5568 $chh_ob =~ s/^/CHH_OB_/;
5569 $chh_ob =~ s/sam$/txt/;
5570 $chh_ob =~ s/bam$/txt/;
5571 $chh_ob =~ s/cram$/txt/;
5572 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
5573 $chh_ob = $output_dir . $chh_ob;
5574
5575 if ($gzip){
5576 $chh_ob .= '.gz';
5577 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5578 }
5579 else{
5580 open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5581 # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5582 }
5583
5584 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
5585 push @sorting_files,$chh_ob;
5586
5587 unless($no_header){
5588 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5589 }
5590 }
5591 return $report_filename;
5592 }
5593
5594 sub isBam{
5595
5596 my $filename = shift;
5597
5598 # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
5599 open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";
5600
5601 ### when BAM files read through a gunzip -c stream they start with BAM...
5602 my $bam_in_disguise = <DISGUISE>;
5603 # warn "BAM in disguise: $bam_in_disguise\n\n";
5604
5605 if ($bam_in_disguise){
5606 if ($bam_in_disguise =~ /^BAM/){
5607 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5608 return 1;
5609 }
5610 else{
5611 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5612 return 0;
5613 }
5614 }
5615 else{
5616 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5617 return 0;
5618 }
5619 }
5620
5621
5622 sub print_helpfile{
5623
5624 print << 'HOW_TO';
5625
5626
5627 DESCRIPTION
5628
5629 The following is a brief description of all options to control the Bismark
5630 methylation extractor. The script reads in a bisulfite read alignment results file
5631 produced by the Bismark bisulfite mapper (in BAM/CRAM/SAM format) and extracts the
5632 methylation information for individual cytosines. This information is found in the
5633 methylation call field which can contain the following characters:
5634
5635 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5636 ~~~ X for methylated C in CHG context ~~~
5637 ~~~ x for not methylated C CHG ~~~
5638 ~~~ H for methylated C in CHH context ~~~
5639 ~~~ h for not methylated C in CHH context ~~~
5640 ~~~ Z for methylated C in CpG context ~~~
5641 ~~~ z for not methylated C in CpG context ~~~
5642 ~~~ U for methylated C in Unknown context (CN or CHN ~~~
5643 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
5644 ~~~ . for any bases not involving cytosines ~~~
5645 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5646
5647 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
5648 context (this distinction is actually already made in Bismark itself). As the methylation
5649 information for every C analysed can produce files which easily have tens or even hundreds of
5650 millions of lines, file sizes can become very large and more difficult to handle. The C
5651 methylation info additionally splits cytosine methylation calls up into one of the four possible
5652 strands a given bisulfite read aligned against:
5653
5654 OT original top strand
5655 CTOT complementary to original top strand
5656
5657 OB original bottom strand
5658 CTOB complementary to original bottom strand
5659
5660 Thus, by default twelve individual output files are being generated per input file (unless
5661 --comprehensive is specified, see below). The output files can be imported into a genome
5662 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
5663 unless the bisulfite reads were generated preserving directionality it doesn't make any
5664 sense to look at the data in a strand-specific manner). Strand-specific output files can
5665 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
5666 will be generated. For both the strand-specific and comprehensive outputs there is also
5667 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
5668
5669
5670 The output files are in the following format (tab delimited):
5671
5672 <sequence_id> <strand> <chromosome> <position> <methylation call>
5673
5674
5675 USAGE: bismark_methylation_extractor [options] <filenames>
5676
5677
5678 ARGUMENTS:
5679 ==========
5680
5681 <filenames> A space-separated list of Bismark result files in SAM format from
5682 which methylation information is extracted for every cytosine in
5683 the reads. For alignment files in the older custom Bismark output
5684 see option '--vanilla'.
5685
5686 OPTIONS:
5687
5688 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
5689 read data. If neither -s nor -p is set the type of experiment will
5690 be determined automatically.
5691
5692 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
5693 read data. If neither -s nor -p is set the type of experiment will
5694 be determined automatically.
5695
5696 --vanilla The Bismark result input file(s) are in the old custom Bismark format
5697 (up to version 0.5.x) and not in SAM format which is the default as
5698 of Bismark version 0.6.x or higher. Default: OFF.
5699
5700 --no_overlap For paired-end reads it is theoretically possible that read_1 and
5701 read_2 overlap. This option avoids scoring overlapping methylation
5702 calls twice (only methylation calls of read 1 are used for in the process
5703 since read 1 has historically higher quality basecalls than read 2).
5704 Whilst this option removes a bias towards more methylation calls
5705 in the center of sequenced fragments it may de facto remove a sizable
5706 proportion of the data. This option is on by default for paired-end data
5707 but can be disabled using --include_overlap. Default: ON.
5708
5709 --include_overlap For paired-end data all methylation calls will be extracted irrespective of
5710 of whether they overlap or not. Default: OFF.
5711
5712 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 (or single-end alignment
5713 files) when processing the methylation call string. This can remove e.g. a
5714 restriction enzyme site at the start of each read or any other source of
5715 bias (such as PBAT-Seq data).
5716
5717 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
5718 results only. Since the first couple of bases in Read 2 of BS-Seq experiments
5719 show a severe bias towards non-methylation as a result of end-repairing
5720 sonicated fragments with unmethylated cytosines (see M-bias plot), it is
5721 recommended that the first couple of bp of Read 2 are removed before
5722 starting downstream analysis. Please see the section on M-bias plots in the
5723 Bismark User Guide for more details.
5724
5725 --ignore_3prime <int> Ignore the last <int> bp from the 3' end of Read 1 (or single-end alignment
5726 files) when processing the methylation call string. This can remove unwanted
5727 biases from the end of reads.
5728
5729 --ignore_3prime_r2 <int> Ignore the last <int> bp from the 3' end of Read 2 of paired-end sequencing
5730 results only. This can remove unwanted biases from the end of reads.
5731
5732 --comprehensive Specifying this option will merge all four possible strand-specific
5733 methylation info into context-dependent output files. The default
5734
5735 contexts are:
5736 - CpG context
5737 - CHG context
5738 - CHH context
5739
5740 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
5741 strand-specific output files (default) for Cs in
5742 - CpG context
5743 - non-CpG context
5744
5745 --report Prints out a short methylation summary as well as the paramaters used to run
5746 this script. Default: ON.
5747
5748 --no_header Suppresses the Bismark version header line in all output files for more convenient
5749 batch processing.
5750
5751 -o/--output DIR Allows specification of a different output directory (absolute or relative
5752 path). If not specified explicitly, the output will be written to the current directory.
5753
5754 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
5755 explicitly if Samtools is in the PATH already.
5756
5757 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
5758 a GZIP compressed form to save disk space. This option is also passed on to the genome-wide
5759 cytosine report. BedGraph and coverage files will be written out as .gz by default.
5760
5761 --version Displays version information.
5762
5763 -h/--help Displays this help file and exits.
5764
5765 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
5766 well as a report (optional) and then quit. Default: OFF.
5767
5768 --mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report.
5769 Only recommended for users who deliberately want to keep an earlier version of the M-bias report.
5770 Default: OFF.
5771
5772 --multicore <int> Sets the number of cores to be used for the methylation extraction process. If system resources
5773 are plentiful this is a viable option to speed up the extraction process (we observed a near linear
5774 speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file
5775 and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore <int>
5776 specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so
5777 --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing
5778 on the bismark2bedGraph or genome-wide cytosine report processes.
5779
5780
5781
5782
5783 bedGraph specific options:
5784 ==========================
5785
5786 --bedGraph After finishing the methylation extraction, the methylation output is written into a
5787 sorted bedGraph file that reports the position of a given cytosine and its methylation
5788 state (in %, see details below). The methylation extractor output is temporarily split up into
5789 temporary files, one per chromosome (written into the current directory or folder
5790 specified with -o/--output); these temp files are then used for sorting and deleted
5791 afterwards. By default, only cytosines in CpG context will be sorted. The option
5792 '--CX_context' may be used to report all cytosines irrespective of sequence context
5793 (this will take MUCH longer!). The default folder for temporary files during the sorting
5794 process is the output directory. The bedGraph conversion step is performed by the external
5795 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
5796 bismark_methylation_extractor itself.
5797
5798 --zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
5799 and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
5800 instead of using 1-based coordinates throughout. Default: OFF.
5801
5802
5803 --cutoff [threshold] The minimum number of times any methylation state (methylated or unmethylated) has to be seen
5804 for a nucleotide before its methylation percentage is reported. Default: 1.
5805
5806 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
5807
5808 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
5809 in the experiment irrespective of its sequence context. This applies to both forward and
5810 reverse strands. Please be aware that this option may generate large temporary and output files
5811 and may take a long time to sort (up to many hours). Default: OFF.
5812 (i.e. Default = CpG context only).
5813
5814 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
5815 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
5816 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
5817 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
5818 Defaults to 2G.
5819
5820 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
5821 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
5822 individual chromosome files. These errors were caused by the operating system's limit
5823 of the number of filehandle that can be written to at any one time (typically 1024; to
5824 find out this limit on Linux, type: ulimit -a).
5825 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
5826 methylation calls into individual chromosome files. Instead, all input files are
5827 temporarily merged into a single file (unless there is only a single file), and this
5828 file will then be sorted by both chromosome AND position using the Unix sort command.
5829 Please be aware that this option might take a looooong time to complete, depending on
5830 the size of the input files, and the memory you allocate to this process (see --buffer_size).
5831 Nevertheless, it seems to be working.
5832
5833 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
5834 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
5835 sorting process of very large files, but this comes at the cost of a larger memory footprint
5836 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
5837 of RAM). Due to overheads in creating and looping through these arrays it seems that it will
5838 actually be *slower* for small files (few million alignments), and we are currently testing at
5839 which point it is advisable to use this option. Note that --ample_memory is not compatible
5840 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
5841
5842
5843
5844 Genome-wide cytosine methylation report specific options:
5845 =========================================================
5846
5847 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
5848 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
5849 chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all
5850 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
5851 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
5852 covered). The cytosine report conversion step is performed by the external module
5853 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
5854 itself.
5855
5856 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
5857 its context. This applies to both forward and reverse strands. Please be aware that this will
5858 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
5859 Default: OFF (i.e. Default = CpG context only).
5860
5861 --zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF.
5862
5863 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
5864 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
5865
5866 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
5867 will be named to include the input filename and the chromosome number.
5868
5869
5870
5871 OUTPUT:
5872
5873 The bismark_methylation_extractor output is in the form:
5874 ========================================================
5875 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
5876
5877 * Methylated cytosines receive a '+' orientation,
5878 * Unmethylated cytosines receive a '-' orientation.
5879
5880
5881
5882 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
5883 =========================================================================================================
5884
5885 track type=bedGraph (header line)
5886
5887 <chromosome> <start position> <end position> <methylation percentage>
5888
5889
5890
5891 The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'):
5892 ============================================================================================================================================
5893
5894 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
5895
5896
5897
5898 The genome-wide cytosine methylation output file is tab-delimited in the following format:
5899 ==========================================================================================
5900 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
5901
5902
5903
5904 This script was last modified on 25 February 2016.
5905
5906 HOW_TO
5907 }