comparison new/bismark_methylation_extractor @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8 use FindBin qw($Bin);
9 use lib "$Bin/../lib";
10
11 ## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk)
12
13 ## This program is free software: you can redistribute it and/or modify
14 ## it under the terms of the GNU General Public License as published by
15 ## the Free Software Foundation, either version 3 of the License, or
16 ## (at your option) any later version.
17
18 ## This program is distributed in the hope that it will be useful,
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ## GNU General Public License for more details.
22
23 ## You should have received a copy of the GNU General Public License
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
25
26 my @filenames; # input files
27 my %counting;
28 my $parent_dir = getcwd();
29
30 my %fhs;
31
32 my $version = 'v0.14.3';
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline();
34
35
36 ### only needed for bedGraph output
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
39 my @bedfiles;
40
41 ### only needed for genome-wide cytosine methylation report
42 my %chromosomes;
43
44 my %mbias_1;
45 my %mbias_2;
46
47
48 ##############################################################################################
49 ### Summarising Run Parameters
50 ##############################################################################################
51
52 ### METHYLATION EXTRACTOR
53
54 warn "Summarising Bismark methylation extractor parameters:\n";
55 warn '='x63,"\n";
56
57 if ($single){
58 if ($vanilla){
59 warn "Bismark single-end vanilla format specified\n";
60 }
61 else{
62 warn "Bismark single-end SAM format specified (default)\n"; # default
63 }
64 }
65 elsif ($paired){
66 if ($vanilla){
67 warn "Bismark paired-end vanilla format specified\n";
68 }
69 else{
70 warn "Bismark paired-end SAM format specified (default)\n"; # default
71 }
72 }
73
74 warn "Number of cores to be used: $multicore\n";
75
76 if ($single){
77 if ($ignore){
78 warn "First $ignore bp will be disregarded when processing the methylation call string\n";
79 }
80 if ($ignore_3prime){
81 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n";
82 }
83
84 }
85 else{ ## paired-end
86 if ($ignore){
87 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
88 }
89 if ($ignore_r2){
90 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
91 }
92
93 if ($ignore_3prime){
94 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n";
95 }
96 if ($ignore_3prime_r2){
97 warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
98 }
99
100
101 }
102
103
104 if ($full){
105 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
106 }
107 if ($merge_non_CpG){
108 warn "Merge CHG and CHH context to non-CpG context specified\n";
109 }
110 ### output directory
111 if ($output_dir eq ''){
112 warn "Output will be written to the current directory ('$parent_dir')\n";
113 }
114 else{
115 warn "Output path specified as: $output_dir\n";
116 }
117
118
119 sleep (1);
120
121 ### BEDGRAPH
122
123 if ($bedGraph){
124 warn "\n\nSummarising bedGraph parameters:\n";
125 warn '='x63,"\n";
126
127 if ($counts){
128 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
129 }
130 else{
131 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
132 }
133
134 ### Zero-based coordinates
135 if ($zero){
136 warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
137 }
138
139 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
140
141 if ($CX_context){
142 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
143 }
144 else{ # default
145 $CpG_only = 1;
146 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
147 }
148
149 if ($remove){
150 warn "White spaces in read ID names will be removed prior to sorting\n";
151 }
152
153 if ($ample_mem){
154 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
155 }
156 elsif (defined $sort_size){
157 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
158 }
159 else{
160 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
161 }
162
163
164
165 sleep (1);
166
167 if ($cytosine_report){
168 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
169 warn '='x63,"\n";
170 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
171
172
173 if ($CX_context){
174 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
175 }
176 else{ # default
177 $CpG_only = 1;
178 warn "Reporting cytosine methylation in CpG context only (default)\n";
179 }
180
181 if ($split_by_chromosome){
182 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
183 }
184
185 ### Zero-based coordinates
186 if ($zero){
187 warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n";
188 }
189 else{ # default, 1-based coords
190 warn "Using 1-based genomic coordinates (default)\n";
191 }
192
193 ### GENOME folder
194 if ($genome_folder){
195 unless ($genome_folder =~/\/$/){
196 $genome_folder =~ s/$/\//;
197 }
198 warn "Genome folder was specified as $genome_folder\n";
199 }
200 else{
201 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
202 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
203 }
204 sleep (1);
205 }
206 }
207
208 warn "\n";
209 sleep (1);
210
211 ######################################################
212 ### PROCESSING FILES
213 ######################################################
214
215 foreach my $filename (@filenames){
216 # resetting counters and filehandles
217 %fhs = ();
218 %counting =(
219 total_meCHG_count => 0,
220 total_meCHH_count => 0,
221 total_meCpG_count => 0,
222 total_unmethylated_CHG_count => 0,
223 total_unmethylated_CHH_count => 0,
224 total_unmethylated_CpG_count => 0,
225 sequences_count => 0,
226 methylation_call_strings => 0,
227 );
228
229 @sorting_files = ();
230 @bedfiles = ();
231
232 %mbias_1 = ();
233 %mbias_2 = ();
234
235
236 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
237 unless ($vanilla){
238 if ($paired){
239 test_positional_sorting($filename);
240 }
241 }
242
243 my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename);
244
245 if ($pid == 0){
246 warn "Finished processing child process. Exiting..\n";
247
248 # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
249 # foreach my $fh (keys %fhs) {
250 # if ($fh =~ /^[1230]$/) {
251 # foreach my $context (keys %{$fhs{$fh}}) {
252 # $fhs{$fh}->{$context}->flush;
253 # }
254 # }
255 # else{
256 # $fhs{$fh}->flush;
257 # }
258 # }
259 exit 0;
260 }
261
262 ###
263 if ($pid and $multicore > 1){
264 warn "Now waiting for all child processes to complete\n";
265 sleep(1);
266
267 ### we need to ensure that we wait for all child processes to be finished before continuing
268 # warn "here are the child IDs: @$pids\n";
269 # warn "Looping through the child process IDs:\n";
270
271 foreach my $id (@$pids){
272 # print "$id\t";
273 my $kid = waitpid ($id,0);
274 # print "Returned: $kid\nExit status: $?\n";
275 unless ($? == 0){
276 warn "\nChild process terminated with exit signal: '$?'\n\n";
277 }
278 }
279 }
280
281 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
282 foreach my $fh (keys %fhs) {
283 if ($fh =~ /^[1230]$/) {
284 foreach my $context (keys %{$fhs{$fh}}) {
285 close $fhs{$fh}->{$context} or die $!;
286 }
287 }
288 else{
289 close $fhs{$fh} or die $!;
290 }
291 }
292
293 ### We need to stitch together a main splitting report from all individual parent/child processes
294 if ($multicore > 1){
295 merge_individual_splitting_reports($report_basename);
296 print_splitting_report();
297
298 merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally
299
300 }
301
302 unless ($mbias_off){
303 ### printing out all M-Bias data
304 produce_mbias_plots ($filename); produce_mbias_plots ($filename);
305
306 }
307
308 unless ($mbias_only){
309 delete_unused_files();
310 }
311
312 if ($bedGraph){
313
314 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
315 $out =~ s/gz$//;
316 $out =~ s/sam$//;
317 $out =~ s/bam$//;
318 $out =~ s/txt$//;
319 $out =~ s/$/bedGraph/;
320
321 my $bedGraph_output = $out;
322 my @args;
323
324 if ($remove){
325 push @args, '--remove';
326 }
327 if ($CX_context){
328 push @args, '--CX_context';
329 }
330 if ($no_header){
331 push @args, '--no_header';
332 }
333 if ($gazillion){
334 push @args, '--gazillion';
335 }
336 if ($ample_mem){
337 push @args, '--ample_memory';
338 }
339 if ($zero){
340 push @args, "--zero";
341 }
342
343 # if ($counts){
344 # push @args, "--counts";
345 # }
346
347 push @args, "--buffer_size $sort_size";
348 push @args, "--cutoff $coverage_threshold";
349 push @args, "--output $bedGraph_output";
350 push @args, "--dir '$output_dir'";
351
352 ### adding all files to be sorted to @args
353 foreach my $f (@sorting_files){
354 push @args, $f;
355 }
356
357 # print join "\t",@args,"\n";
358
359 system ("$Bin/bismark2bedGraph @args");
360
361 warn "Finished BedGraph conversion ...\n\n";
362 sleep(1);
363
364 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
365 # warn "Writing bedGraph to file: $bedGraph_output\n";
366 # process_bedGraph_output();
367 # close OUT or die $!;
368
369 ### genome-wide cytosine methylation report requires bedGraph processing anyway
370 if ($cytosine_report){
371
372 @args = (); # resetting @args
373 my $cytosine_out = $out;
374 $cytosine_out =~ s/bedGraph$//;
375
376 if ($CX_context){
377 $cytosine_out =~ s/$/CX_report.txt/;
378 }
379 else{
380 $cytosine_out =~ s/$/CpG_report.txt/;
381 }
382
383 push @args, "--output $cytosine_out";
384 push @args, "--dir '$output_dir'";
385 push @args, "--genome '$genome_folder'";
386 push @args, "--parent_dir '$parent_dir'";
387
388 if ($zero){
389 push @args, "--zero";
390 }
391 if ($CX_context){
392 push @args, '--CX_context';
393 }
394 if ($split_by_chromosome){
395 push @args, '--split_by_chromosome';
396 }
397
398 my $coverage_output = $bedGraph_output;
399 $coverage_output =~ s/bedGraph$/bismark.cov/;
400
401 push @args, $output_dir . $coverage_output; # this will be the infile
402
403 system ("$Bin/coverage2cytosine @args");
404 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
405 warn "\n\nFinished generating genome-wide cytosine report\n\n";
406 }
407 }
408 }
409
410 sub merge_individual_splitting_reports{
411
412 my $report_basename = shift;
413
414 my @splitting_reports; # only needed in multi-core mode to generate an overall report
415 foreach my $ext (1..$multicore){
416 push @splitting_reports, "$report_basename.$ext";
417 }
418 warn "\nMerging individual splitting reports into overall report: '$report_basename'\n";
419 warn "Merging from these individual files:\n";
420 print join ("\n",@splitting_reports),"\n\n";
421 sleep(1);
422
423 ##########
424 # resetting the counter first
425 %counting =(
426 total_meCHG_count => 0,
427 total_meCHH_count => 0,
428 total_meCpG_count => 0,
429 total_unmethylated_CHG_count => 0,
430 total_unmethylated_CHH_count => 0,
431 total_unmethylated_CpG_count => 0,
432 sequences_count => 0,
433 methylation_call_strings => 0,
434 );
435
436 # repopulating the merged counter
437 foreach my $file (@splitting_reports){
438 open (IR,$file) or die $!;
439 while (<IR>){
440 chomp;
441 my ($context,$count) = (split /\t/);
442 if ($context){
443 if ($context =~ /^Total C to T conversions in CpG context/){
444 # warn "context: $context\ncount: $count\n";
445 $counting{total_unmethylated_CpG_count} += $count;
446 }
447 elsif ($context =~ /^Total C to T conversions in CHG context/){
448 # warn "context: $context\ncount: $count\n";
449 $counting{total_unmethylated_CHG_count} += $count;
450 }
451 elsif ($context =~ /^Total C to T conversions in CHH context/){
452 # warn "context: $context\ncount: $count\n";
453 $counting{total_unmethylated_CHH_count} += $count;
454 }
455 elsif ($context =~ /^Total methylated C\'s in CpG context/){
456 # warn "context: $context\ncount: $count\n";
457 $counting{total_meCpG_count} += $count;
458 }
459 elsif ($context =~ /^Total methylated C\'s in CHG context/){
460 # warn "context: $context\ncount: $count\n";
461 $counting{total_meCHG_count} += $count;
462 }
463 elsif ($context =~ /^Total methylated C\'s in CHH context/){
464 # warn "context: $context\ncount: $count\n";
465 $counting{total_meCHH_count} += $count;
466 }
467 elsif ($context =~ /^line count/){
468 # warn "Line count\ncount: $count\n";
469 $counting{sequences_count} = $count; # always the same
470 }
471 elsif ($context =~ /^meth call strings/){
472 # warn "Meth call strings\ncount: $count\n";
473 $counting{methylation_call_strings} += $count;
474 }
475 }
476 }
477 }
478
479 # deleting the individual reports afterwards
480 foreach my $file (@splitting_reports){
481 unlink $file;
482 }
483 }
484
485
486 sub merge_individual_mbias_reports{
487
488 my $report_basename = shift;
489
490 my @mbias_reports; # only needed in multi-core mode to generate an overall report
491 foreach my $ext (1..$multicore){
492 push @mbias_reports, "$report_basename.${ext}.mbias";
493 }
494 warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n";
495 print join ("\n",@mbias_reports),"\n\n";
496
497
498 ##########
499 # resetting the counters first, then repopulating them
500 %mbias_1 = ();
501 %mbias_2 = ();
502
503 # repopulating the merged counter
504 foreach my $file (@mbias_reports){
505 open (IR,$file) or die $!;
506
507 my $context;
508 my $read;
509
510 while (<IR>){
511 chomp;
512 # warn "$_\n"; sleep(1);
513 if ($_ =~ /context/){
514 $context = $1 if ($_ =~ /(\D{3}) context/);
515 # warn "Context set as $context\n";
516
517 if ($_ =~ /R2/){
518 $read = 'R2';
519 }
520 else{
521 $read = 'R1';
522 }
523 # warn "Setting read identity to '$read'\n";
524
525 # reading in 2 additional lines (===========, and header line)
526 $_ = <IR>;
527 #warn "discarding line $_\n";
528 $_ = <IR>;
529 #warn "discarding line $_\n";
530 next;
531 }
532 else{
533
534 if ($_ eq ''){
535 # empty line, only occurs after a context has finished and before a new context starts
536 next;
537 }
538
539 my ($pos,$meth,$unmeth) = (split /\t/);
540 # warn "$pos\t$meth\t$unmeth\n"; sleep(1);
541 if ($read eq 'R1'){
542 $mbias_1{$context}->{$pos}->{meth} += $meth;
543 $mbias_1{$context}->{$pos}->{un} += $unmeth;
544 }
545 elsif ($read eq 'R2'){
546 $mbias_2{$context}->{$pos}->{meth} += $meth;
547 $mbias_2{$context}->{$pos}->{un} += $unmeth;
548 }
549 }
550 }
551 close IR or warn "Had trouble closing filehandle for $file: $!\n";
552 }
553
554 # deleting the individual reports afterwards
555 foreach my $file (@mbias_reports){
556 unlink $file;
557 }
558 }
559
560
561 sub delete_unused_files{
562
563 warn "Deleting unused files ...\n\n"; sleep(1);
564
565 my $index = 0;
566
567 while ($index <= $#sorting_files){
568 if ($sorting_files[$index] =~ /gz$/){
569 open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
570 }
571 else{
572 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
573 }
574
575 my $used = 0;
576
577 while (<USED>){
578 next if (/^Bismark/);
579 if ($_){
580 $used = 1;
581 last;
582 }
583 }
584
585 if ($used){
586 warn "$sorting_files[$index] contains data ->\tkept\n";
587 ++$index;
588 }
589 else{
590
591 my $delete = unlink $sorting_files[$index];
592
593 if ($delete){
594 warn "$sorting_files[$index] was empty ->\tdeleted\n";
595 }
596 else{
597 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n"
598 }
599
600 ### we also need to remove the element from @sorting_files
601 splice @sorting_files, $index, 1;
602 }
603 }
604 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately)
605 }
606
607 sub produce_mbias_plots{
608
609 my $filename = shift;
610
611 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
612 $mbias =~ s/gz$//;
613 $mbias =~ s/sam$//;
614 $mbias =~ s/bam$//;
615 $mbias =~ s/txt$//;
616 my $mbias_graph_1 = my $mbias_graph_2 = $mbias;
617 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png';
618 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png';
619
620 $mbias =~ s/$/M-bias.txt/;
621
622 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n";
623
624 # determining maximum read length
625 my $max_length_1 = 0;
626 my $max_length_2 = 0;
627
628 foreach my $context (keys %mbias_1){
629 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
630 $max_length_1 = $pos unless ($max_length_1 >= $pos);
631 }
632 }
633 if ($paired){
634 foreach my $context (keys %mbias_2){
635 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
636 $max_length_2 = $pos unless ($max_length_2 >= $pos);
637 }
638 }
639 }
640
641 if ($single){
642 warn "Determining maximum read length for M-Bias plot\n";
643 warn "Maximum read length of Read 1: $max_length_1\n\n";
644 }
645 else{
646 warn "Determining maximum read lengths for M-Bias plots\n";
647 warn "Maximum read length of Read 1: $max_length_1\n";
648 warn "Maximum read length of Read 2: $max_length_2\n\n";
649 }
650 # sleep(3);
651
652 my @mbias_read1;
653 my @mbias_read2;
654
655 #Check whether the module GD::Graph:lines is installed
656 my $gd_graph_installed = 0;
657 eval{
658 require GD::Graph::lines;
659 GD::Graph::lines->import();
660 };
661
662 unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...}
663 $gd_graph_installed = 1;
664
665 #Check whether the module GD::Graph::colour is installed
666 eval{
667 require GD::Graph::colour;
668 GD::Graph::colour->import(qw(:colours :lists :files :convert));
669 };
670
671 if ($@) {
672 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
673 sleep(2);
674 $gd_graph_installed = 0;
675 }
676
677
678 }
679 else{
680 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
681 sleep(2);
682 }
683
684
685 my $graph_title;
686 my $graph1;
687 my $graph2;
688
689 if ( $gd_graph_installed){
690 $graph1 = GD::Graph::lines->new(800,600);
691 if ($paired){
692 $graph2 = GD::Graph::lines->new(800,600);
693 }
694 }
695
696 foreach my $context (qw(CpG CHG CHH)){
697 @{$mbias_read1[0]} = ();
698
699 if ($paired){
700 print MBIAS "$context context (R1)\n================\n";
701 $graph_title = 'M-bias (Read 1)';
702 }
703 else{
704 print MBIAS "$context context\n===========\n";
705 $graph_title = 'M-bias';
706 }
707 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
708
709 foreach my $pos (1..$max_length_1){
710
711 unless (defined $mbias_1{$context}->{$pos}->{meth}){
712 $mbias_1{$context}->{$pos}->{meth} = 0;
713 }
714 unless (defined $mbias_1{$context}->{$pos}->{un}){
715 $mbias_1{$context}->{$pos}->{un} = 0;
716 }
717
718 my $percent = '';
719 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
720 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
721 }
722 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
723
724 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
725 push @{$mbias_read1[0]},$pos;
726
727 if ($context eq 'CpG'){
728 push @{$mbias_read1[1]},$percent;
729 push @{$mbias_read1[4]},$coverage;
730 }
731 elsif ($context eq 'CHG'){
732 push @{$mbias_read1[2]},$percent;
733 push @{$mbias_read1[5]},$coverage;
734 }
735 elsif ($context eq 'CHH'){
736 push @{$mbias_read1[3]},$percent;
737 push @{$mbias_read1[6]},$coverage;
738 }
739 }
740 print MBIAS "\n";
741 }
742
743 if ( $gd_graph_installed){
744
745 add_colour(nice_blue => [31,120,180]);
746 add_colour(nice_orange => [255,127,0]);
747 add_colour(nice_green => [51,160,44]);
748 add_colour(pale_blue => [153,206,227]);
749 add_colour(pale_orange => [253,204,138]);
750 add_colour(pale_green => [191,230,207]);
751
752 $graph1->set(
753 x_label => 'position (bp)',
754 y1_label => '% methylation',
755 y2_label => '# methylation calls',
756 title => $graph_title,
757 line_width => 2,
758 x_max_value => $max_length_1,
759 x_min_value => 0,
760 y_tick_number => 10,
761 y_label_skip => 2,
762 y1_max_value => 100,
763 y1_min_value => 0,
764 y_label_skip => 2,
765 y2_min_value => 0,
766 x_label_skip => 5,
767 x_label_position => 0.5,
768 x_tick_offset => -1,
769 bgclr => 'white',
770 transparent => 0,
771 two_axes => 1,
772 use_axis => [1,1,1,2,2,2],
773 legend_placement => 'RC',
774 legend_spacing => 6,
775 legend_marker_width => 24,
776 legend_marker_height => 18,
777 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
778 ) or die $graph1->error;
779
780 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
781
782 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
783 if (my $gd1 = $graph1->plot(\@mbias_read1)) {
784 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
785 binmode MBIAS_G1;
786 print MBIAS_G1 $gd1->png;
787 }
788 else {
789 warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n";
790 }
791
792 # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error;
793 # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
794 # binmode MBIAS_G1;
795 # print MBIAS_G1 $gd1->png;
796 }
797
798 if ($paired){
799
800 foreach my $context (qw(CpG CHG CHH)){
801 @{$mbias_read2[0]} = ();
802
803 print MBIAS "$context context (R2)\n================\n";
804 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
805 foreach my $pos (1..$max_length_2){
806
807 unless (defined $mbias_2{$context}->{$pos}->{meth}){
808 $mbias_2{$context}->{$pos}->{meth} = 0;
809 }
810 unless (defined $mbias_2{$context}->{$pos}->{un}){
811 $mbias_2{$context}->{$pos}->{un} = 0;
812 }
813
814 my $percent = '';
815 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
816 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
817 }
818 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
819
820 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
821
822 push @{$mbias_read2[0]},$pos;
823
824 if ($context eq 'CpG'){
825 push @{$mbias_read2[1]},$percent;
826 push @{$mbias_read2[4]},$coverage;
827 }
828 elsif ($context eq 'CHG'){
829 push @{$mbias_read2[2]},$percent;
830 push @{$mbias_read2[5]},$coverage;
831 }
832 elsif ($context eq 'CHH'){
833 push @{$mbias_read2[3]},$percent;
834 push @{$mbias_read2[6]},$coverage;
835 }
836 }
837 print MBIAS "\n";
838 }
839
840 if ( $gd_graph_installed){
841
842 add_colour(nice_blue => [31,120,180]);
843 add_colour(nice_orange => [255,127,0]);
844 add_colour(nice_green => [51,160,44]);
845 add_colour(pale_blue => [153,206,227]);
846 add_colour(pale_orange => [253,204,138]);
847 add_colour(pale_green => [191,230,207]);
848
849 $graph2->set(
850 x_label => 'position (bp)',
851 line_width => 2,
852 x_max_value => $max_length_1,
853 x_min_value => 0,
854 y_tick_number => 10,
855 y_label_skip => 2,
856 y1_max_value => 100,
857 y1_min_value => 0,
858 y_label_skip => 2,
859 y2_min_value => 0,
860 x_label_skip => 5,
861 x_label_position => 0.5,
862 x_tick_offset => -1,
863 bgclr => 'white',
864 transparent => 0,
865 two_axes => 1,
866 use_axis => [1,1,1,2,2,2],
867 legend_placement => 'RC',
868 legend_spacing => 6,
869 legend_marker_width => 24,
870 legend_marker_height => 18,
871 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
872 x_label => 'position (bp)',
873 y1_label => '% methylation',
874 y2_label => '# calls',
875 title => 'M-bias (Read 2)',
876 ) or die $graph2->error;
877
878 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
879
880 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014)
881 if (my $gd2 = $graph2->plot(\@mbias_read2)) {
882 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
883 binmode MBIAS_G2;
884 print MBIAS_G2 $gd2->png;
885 }
886 else {
887 warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n";
888 }
889
890 # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error;
891 # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
892 # binmode MBIAS_G2;
893 # print MBIAS_G2 $gd2->png;
894
895 }
896 }
897 }
898
899 sub process_commandline{
900 my $help;
901 my $single_end;
902 my $paired_end;
903 my $ignore;
904 my $ignore_r2;
905 my $genomic_fasta;
906 my $full;
907 my $report;
908 my $extractor_version;
909 my $no_overlap;
910 my $merge_non_CpG;
911 my $vanilla;
912 my $output_dir;
913 my $no_header;
914 my $bedGraph;
915 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
916 my $remove;
917 my $counts;
918 my $cytosine_report;
919 my $genome_folder;
920 my $zero;
921 my $CpG_only;
922 my $CX_context;
923 my $split_by_chromosome;
924 my $sort_size;
925 my $samtools_path;
926 my $gzip;
927 my $mbias_only;
928 my $mbias_off;
929 my $gazillion;
930 my $ample_mem;
931 my $include_overlap;
932 my $ignore_3prime;
933 my $ignore_3prime_r2;
934 my $multicore;
935
936 my $command_line = GetOptions ('help|man' => \$help,
937 'p|paired-end' => \$paired_end,
938 's|single-end' => \$single_end,
939 'fasta' => \$genomic_fasta,
940 'ignore=i' => \$ignore,
941 'ignore_r2=i' => \$ignore_r2,
942 'comprehensive' => \$full,
943 'report' => \$report,
944 'version' => \$extractor_version,
945 'no_overlap' => \$no_overlap,
946 'merge_non_CpG' => \$merge_non_CpG,
947 'vanilla' => \$vanilla,
948 'o|output=s' => \$output_dir,
949 'no_header' => \$no_header,
950 'bedGraph' => \$bedGraph,
951 "cutoff=i" => \$coverage_threshold,
952 "remove_spaces" => \$remove,
953 "counts" => \$counts,
954 "cytosine_report" => \$cytosine_report,
955 'g|genome_folder=s' => \$genome_folder,
956 "zero_based" => \$zero,
957 "CX|CX_context" => \$CX_context,
958 "split_by_chromosome" => \$split_by_chromosome,
959 "buffer_size=s" => \$sort_size,
960 'samtools_path=s' => \$samtools_path,
961 'gzip' => \$gzip,
962 'mbias_only' => \$mbias_only,
963 'mbias_off' => \$mbias_off,
964 'gazillion|scaffolds' => \$gazillion,
965 'ample_memory' => \$ample_mem,
966 'include_overlap' => \$include_overlap,
967 'ignore_3prime=i' => \$ignore_3prime,
968 'ignore_3prime_r2=i' => \$ignore_3prime_r2,
969 'multicore=i' => \$multicore,
970 );
971
972 ### EXIT ON ERROR if there were errors with any of the supplied options
973 unless ($command_line){
974 die "Please respecify command line options\n";
975 }
976
977 ### HELPFILE
978 if ($help){
979 print_helpfile();
980 exit;
981 }
982
983 if ($extractor_version){
984 print << "VERSION";
985
986
987 Bismark Methylation Extractor
988
989 Bismark Extractor Version: $version
990 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
991 www.bioinformatics.babraham.ac.uk/projects/bismark/
992
993
994 VERSION
995 exit;
996 }
997
998
999 ### no files provided
1000 unless (@ARGV){
1001 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
1002 }
1003 @filenames = @ARGV;
1004
1005 warn "\n *** Bismark methylation extractor version $version ***\n\n";
1006
1007 ### M-BIAS ONLY
1008 if ($mbias_only){
1009
1010 if ($mbias_off){
1011 die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n";
1012 }
1013 if ($bedGraph){
1014 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
1015 }
1016 if ($cytosine_report){
1017 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
1018 }
1019 if ($merge_non_CpG){
1020 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
1021 }
1022 if ($full){
1023 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
1024 }
1025 sleep(3);
1026 }
1027
1028 ### PRINT A REPORT
1029 unless ($report){
1030 $report = 1; # making this the default
1031 }
1032
1033 ### OUTPUT DIR PATH
1034 if ($output_dir){
1035 unless ($output_dir =~ /\/$/){
1036 $output_dir =~ s/$/\//;
1037 }
1038 }
1039 else{
1040 $output_dir = '';
1041 }
1042
1043 ### NO HEADER
1044 unless ($no_header){
1045 $no_header = 0;
1046 }
1047
1048 ### OLD (VANILLA) OUTPUT FORMAT
1049 unless ($vanilla){
1050 $vanilla = 0;
1051 }
1052
1053 if ($single_end){
1054 $paired_end = 0; ### SINGLE END ALIGNMENTS
1055 }
1056 elsif ($paired_end){
1057 $single_end = 0; ### PAIRED-END ALIGNMENTS
1058 }
1059 else{
1060
1061 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
1062
1063 if ($vanilla){
1064 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
1065 }
1066 else{ # SAM/BAM format
1067
1068 my $file = $filenames[0];
1069 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
1070
1071 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
1072 if ($file =~ /\.gz$/){
1073 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n";
1074 }
1075 elsif ($file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam
1076 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n";
1077 }
1078 else{
1079 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
1080 }
1081
1082 while (<DETERMINE>){
1083 last unless (/^\@/);
1084 if ($_ =~ /^\@PG/){
1085 # warn "found the \@PG line:\n";
1086 # warn "$_";
1087
1088 if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
1089 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
1090 $paired_end = 1;
1091 $single_end = 0;
1092 }
1093 else{
1094 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
1095 $paired_end = 0;
1096 $single_end = 1;
1097 }
1098 }
1099 }
1100
1101 # close DETERMINE or warn $!; # this always throws an error anyway...
1102
1103 }
1104 }
1105
1106 ### IGNORING <INT> 5' END
1107 # bases at the start of the read when processing the methylation call string
1108 unless ($ignore){
1109 $ignore = 0;
1110 }
1111
1112 if (defined $ignore_r2){
1113 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
1114 }
1115 else{
1116 $ignore_r2 = 0;
1117 }
1118
1119 ### IGNORING <INT> 3' END
1120 # bases at the end of the read when processing the methylation call string
1121 unless ($ignore_3prime){
1122 $ignore_3prime = 0;
1123 }
1124
1125 if (defined $ignore_3prime_r2){
1126 die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end);
1127 }
1128 else{
1129 $ignore_3prime_r2 = 0;
1130 }
1131
1132
1133 ### NO OVERLAP
1134 ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps
1135 if ($include_overlap){
1136 die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
1137 warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n";
1138 $no_overlap = 0;
1139 }
1140 else{ # default
1141 if ($paired_end){
1142 warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n";
1143 $no_overlap = 1;
1144 }
1145 }
1146
1147 ### COMPREHENSIVE OUTPUT
1148 unless ($full){
1149 $full = 0;
1150 }
1151
1152 ### MERGE NON-CpG context
1153 unless ($merge_non_CpG){
1154 $merge_non_CpG = 0;
1155 }
1156
1157 ### remove white spaces in read ID (needed for sorting using the sort command
1158 unless ($remove){
1159 $remove = 0;
1160 }
1161
1162 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
1163 if (defined $coverage_threshold){
1164 unless ($coverage_threshold > 0){
1165 die "Please select a coverage greater than 0 (positive integers only)\n";
1166 }
1167 }
1168 else{
1169 $coverage_threshold = 1;
1170 }
1171
1172 ### SORT buffer size
1173 if (defined $sort_size){
1174 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
1175 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
1176 }
1177 }
1178 else{
1179 $sort_size = '2G';
1180 }
1181
1182 if ($zero){
1183 die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
1184 }
1185
1186 if ($CX_context){
1187 die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
1188 }
1189 else{
1190 $CX_context = 0;
1191 }
1192
1193 unless ($counts){
1194 $counts = 1; # counts will always be set
1195 }
1196
1197 if ($cytosine_report){
1198
1199 ### GENOME folder
1200 if ($genome_folder){
1201 unless ($genome_folder =~/\/$/){
1202 $genome_folder =~ s/$/\//;
1203 }
1204 }
1205 else{
1206 die "Please specify a genome folder to proceed (full path only)\n";
1207 }
1208
1209 unless ($bedGraph){
1210 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
1211 $bedGraph = 1;
1212 }
1213 unless ($counts){
1214 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
1215 $counts = 1;
1216 }
1217 warn "\n";
1218 }
1219
1220 ### PATH TO SAMTOOLS
1221 if (defined $samtools_path){
1222 # if Samtools was specified as full command
1223 if ($samtools_path =~ /samtools$/){
1224 if (-e $samtools_path){
1225 # Samtools executable found
1226 }
1227 else{
1228 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
1229 }
1230 }
1231 else{
1232 unless ($samtools_path =~ /\/$/){
1233 $samtools_path =~ s/$/\//;
1234 }
1235 $samtools_path .= 'samtools';
1236 if (-e $samtools_path){
1237 # Samtools executable found
1238 }
1239 else{
1240 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
1241 }
1242 }
1243 }
1244 # Check whether Samtools is in the PATH if no path was supplied by the user
1245 else{
1246 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
1247 $samtools_path = `which samtools`;
1248 chomp $samtools_path;
1249 }
1250 }
1251
1252 unless (defined $samtools_path){
1253 $samtools_path = '';
1254 }
1255
1256
1257 if ($gazillion){
1258 if ($ample_mem){
1259 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
1260 }
1261 }
1262
1263 if (defined $multicore){
1264 unless ($multicore > 0){
1265 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n";
1266 }
1267 if ($multicore > 20){
1268 warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n";
1269 }
1270 }
1271 else{
1272 $multicore = 1; # default. Single-thread mode
1273 warn "Setting core usage to single-threaded (default). Consider using --multicore <int> to speed up the extraction process.\n\n";
1274 }
1275
1276 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore);
1277 }
1278
1279
1280 sub test_positional_sorting{
1281
1282 my $filename = shift;
1283
1284 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
1285 sleep(1);
1286
1287 if ($filename =~ /\.gz$/) {
1288 open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
1289 }
1290 elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
1291 if ($samtools_path){
1292 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
1293 }
1294 else{
1295 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1296 }
1297 }
1298 else {
1299 open (TEST,$filename) or die "Can't open file $filename: $!\n";
1300 }
1301
1302 my $count = 0;
1303
1304 while (<TEST>) {
1305 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
1306 if (/^\@SO/) {
1307 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1308 }
1309 next;
1310 }
1311 $count++;
1312
1313 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
1314
1315 my ($id_1) = (split (/\t/));
1316
1317 ### reading the next line which should be read 2
1318 $_ = <TEST>;
1319 my ($id_2) = (split (/\t/));
1320 last unless ($id_2);
1321 ++$count;
1322
1323 if ($id_1 eq $id_2){
1324 ### ids are the same
1325 next;
1326 }
1327 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
1328 my $id_1_trunc = $id_1;
1329 $id_1_trunc =~ s/\/1$//;
1330 my $id_2_trunc = $id_2;
1331 $id_2_trunc =~ s/\/2$//;
1332
1333 unless ($id_1_trunc eq $id_2_trunc){
1334 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1335 }
1336 }
1337 }
1338 # close TEST or die $!; somehow fails on our cluster...
1339 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
1340 warn "...passed!\n";
1341 sleep(1);
1342
1343 }
1344
1345 sub process_Bismark_results_file{
1346
1347 my $filename = shift;
1348 my $report_filename = open_output_filehandles($filename);
1349
1350 ### disabling buffering so we don't run into problems with half written out lines...
1351 foreach my $fh (keys %fhs){
1352 if ($fh =~ /^[1230]$/) {
1353 foreach my $context (keys %{$fhs{$fh}}) {
1354 select($fhs{$fh}->{$context});
1355 $|++;
1356 }
1357 }
1358 else{
1359 select($fhs{$fh});
1360 $|++;
1361 }
1362 }
1363 select(STDOUT);
1364
1365 ################################################
1366 ################################################
1367 ### multi-process handling
1368 ###
1369
1370 my $offset = 1;
1371 my $process_id;
1372 my @pids;
1373 if ($multicore > 1){
1374
1375 until ($offset == $multicore){
1376 # warn "multicore: $multicore\noffset: $offset\n";
1377 my $fork = fork;
1378
1379 if (defined $fork){
1380 if ($fork != 0){
1381 $process_id = $fork;
1382 push @pids, $process_id;
1383 if ($offset < $multicore){
1384 ++$offset;
1385 # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n";
1386 }
1387 else{
1388 # warn "Reached the number of maximum multicores. Proceeeding to processing...\n";
1389 }
1390 }
1391 elsif ($fork == 0){
1392 # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n";
1393 $process_id = $fork;
1394 last;
1395 }
1396 }
1397 else{
1398 die "Forking unsuccessful. Proceeding using a single thread only\n";
1399 }
1400 }
1401
1402 # warn "\nThe thread identity\n===================\n";
1403 if ($process_id){
1404 # print "I am the parent process. My children are called:\n";
1405 # print join ("\t",@pids),"\n";
1406 # print "I am going to process the following line count: $offset\n\n";
1407 }
1408 elsif($process_id == 0){
1409 # warn "I am a child process: Process ID: $process_id\n";
1410 # warn "I am going to process the following line count: $offset\n\n";
1411 }
1412 else{
1413 die "Process ID was: $process_id\n";
1414 }
1415 }
1416 else{
1417 # warn "Single-core mode: setting pid to 1\n";
1418 $process_id = 1;
1419 }
1420
1421 ################################################
1422 ################################################
1423 if ($process_id){
1424 warn "Now reading in Bismark result file $filename\n";
1425 }
1426 else{
1427 warn "\nNow reading in Bismark result file $filename\n\n";
1428 }
1429
1430 if ($filename =~ /\.gz$/) {
1431 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
1432 }
1433 elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
1434 if ($samtools_path){
1435 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
1436 }
1437 else{
1438 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1439 }
1440 }
1441 else {
1442 open (IN,$filename) or die "Can't open file $filename: $!\n";
1443 }
1444
1445 ### Vanilla and SAM output need to read different numbers of header lines
1446 if ($vanilla) {
1447 my $bismark_version = <IN>; ## discarding the Bismark version info
1448 chomp $bismark_version;
1449 $bismark_version =~ s/\r//; # replaces \r line feed
1450 $bismark_version =~ s/Bismark version: //;
1451 if ($bismark_version =~ /^\@/) {
1452 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
1453 sleep (1);
1454 }
1455
1456 unless ($version eq $bismark_version){
1457 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
1458 }
1459 } else {
1460 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
1461 # We are reading from it further down
1462 }
1463
1464 my $methylation_call_strings_processed = 0;
1465 my $line_count = 0;
1466
1467 ### proceeding differently now for single-end or paired-end Bismark files
1468
1469 ### PROCESSING SINGLE-END RESULT FILES
1470 if ($single) {
1471
1472 ### also proceeding differently now for SAM format or vanilla Bismark format files
1473 if ($vanilla) { # old vanilla Bismark output format
1474 while (<IN>) {
1475 ++$line_count;
1476 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1477
1478 if ( ($line_count - $offset)%$multicore == 0){
1479 # warn "line count: $line_count\noffset: $offset\n";
1480 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1481 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1482 }
1483 else{
1484 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1485 next;
1486 }
1487
1488 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1489 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
1490
1491 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
1492 ### last position
1493 chomp $genome_conversion;
1494
1495 my $index;
1496 if ($meth_call) {
1497
1498 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1499 $index = 0;
1500 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1501 $index = 1;
1502 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1503 $index = 3;
1504 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1505 $index = 2;
1506 } else {
1507 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1508 }
1509
1510 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1511 if ($ignore) {
1512 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1513
1514 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1515 if ($strand eq '+') {
1516 $start += $ignore;
1517 }
1518 elsif ($strand eq '-') {
1519 $start += length($meth_call)-1; ## $meth_call is already shortened!
1520 }
1521 else {
1522 die "Alignment did not have proper strand information: $strand\n";
1523 }
1524 }
1525
1526 ### Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int>
1527 if ($ignore_3prime) {
1528
1529 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
1530
1531 ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly
1532 if ($strand eq '+') {
1533 # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all
1534 }
1535 elsif ($strand eq '-') {
1536 # here we need to discriminate if the start has been adjusted because of --ignore or not
1537 if ($ignore){
1538 # position adjusted already, and because of this 3' trimming is irrelevant for the start position
1539 }
1540 else{
1541 # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step
1542 $start += $ignore_3prime;
1543 }
1544 }
1545 else {
1546 die "Alignment did not have proper strand information: $strand\n";
1547 }
1548 }
1549 ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage
1550
1551 ### printing out the methylation state of every C in the read
1552 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
1553
1554 ++$methylation_call_strings_processed; # 1 per single-end result
1555 }
1556 }
1557 } else { # processing single-end SAM format (default)
1558 while (<IN>) {
1559 chomp;
1560 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1561 if (/^\@/) { # skipping header lines (starting with @)
1562 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0);
1563 next;
1564 }
1565
1566 ++$line_count;
1567 # warn "$line_count\n";
1568 warn "Processed lines: $line_count\n" if ($line_count%500000 == 0);
1569
1570 if ( ($line_count - $offset)%$multicore == 0){
1571 # warn "line count: $line_count\noffset: $offset\n";
1572 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1573 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1574 }
1575 else{
1576 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1577 next;
1578 }
1579
1580 # example read in SAM format
1581 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1582 ###
1583
1584 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1585 # < 0.7.6 $meth_call =~ s/^XM:Z://;
1586 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
1587 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
1588
1589 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
1590
1591 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1592 my $meth_call; ### Thanks to Zachary Zeno for this solution
1593 my $read_conversion;
1594 my $genome_conversion;
1595
1596 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1597 my $tag = $1;
1598 my $value = $2;
1599
1600 if ($tag eq "XM") {
1601 $meth_call = $value;
1602 $meth_call =~ s/\r//;
1603 } elsif ($tag eq "XR") {
1604 $read_conversion = $value;
1605 $read_conversion =~ s/\r//;
1606 } elsif ($tag eq "XG") {
1607 $genome_conversion = $value;
1608 $genome_conversion =~ s/\r//;
1609 }
1610 }
1611
1612 my $strand;
1613 # warn "$meth_call\n$read_conversion\n$genome_conversion\n";
1614
1615 my $index;
1616 if ($meth_call) {
1617 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1618 $index = 0;
1619 $strand = '+';
1620 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1621 $index = 1;
1622 $strand = '-';
1623 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1624 $index = 2;
1625 $strand = '+';
1626 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1627 $index = 3;
1628 $strand = '-';
1629 } else {
1630 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1631 }
1632
1633 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
1634 if ($strand eq '-') {
1635 $meth_call = reverse $meth_call;
1636 }
1637 # warn "\n$meth_call\n";
1638
1639 ### IGNORE 5 PRIME: Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1640 if ($ignore) {
1641 # warn "\n\n$meth_call\n";
1642 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1643 # warn "$meth_call\n";sleep(1);
1644
1645 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1646
1647 my @len = split (/\D+/,$cigar); # storing the length per operation
1648 my @ops = split (/\d+/,$cigar); # storing the operation
1649 shift @ops; # remove the empty first element
1650 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1651
1652 my @comp_cigar; # building an array with all CIGAR operations
1653 foreach my $index (0..$#len) {
1654 foreach (1..$len[$index]) {
1655 # print "$ops[$index]";
1656 push @comp_cigar, $ops[$index];
1657 }
1658 }
1659 # print "original CIGAR: $cigar\n";
1660 # print "original CIGAR: @comp_cigar\n";
1661
1662 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1663 if ($strand eq '+') {
1664
1665 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1666 my $I_count = 0;
1667
1668 for (1..$ignore) {
1669 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
1670 # print "$_ deleted $op\n";
1671
1672 while ($op eq 'D') { # repeating this for deletions (D)
1673 $D_count++;
1674 $op = shift @comp_cigar;
1675 # print "$_ deleted $op\n";
1676 }
1677 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1678 $I_count++;
1679 }
1680 }
1681 $start += $ignore + $D_count - $I_count;
1682 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
1683 }
1684 elsif ($strand eq '-') {
1685
1686 for (1..$ignore) {
1687 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1688 while ($op eq 'D') { # repeating this for deletions (D)
1689 $op = pop @comp_cigar;
1690 }
1691 }
1692
1693 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
1694 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
1695 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
1696 foreach (@comp_cigar) {
1697 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
1698 }
1699 $start += $MD_count - 1;
1700 }
1701
1702 ### reconstituting shortened CIGAR string
1703 my $new_cigar;
1704 my $count = 0;
1705 my $last_op;
1706 # print "ignore adjusted: @comp_cigar\n";
1707 foreach my $op (@comp_cigar) {
1708 unless (defined $last_op){
1709 $last_op = $op;
1710 ++$count;
1711 next;
1712 }
1713 if ($last_op eq $op) {
1714 ++$count;
1715 } else {
1716 $new_cigar .= "$count$last_op";
1717 $last_op = $op;
1718 $count = 1;
1719 }
1720 }
1721 $new_cigar .= "$count$last_op"; # appending the last operation and count
1722 $cigar = $new_cigar;
1723 # print "ignore adjusted scalar: $cigar\n";
1724 }
1725
1726 #######################
1727 ### INGORE 3' END ###
1728 #######################
1729
1730 # Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int>
1731 if ($ignore_3prime) {
1732 # warn "$meth_call\n";
1733 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime);
1734 # warn "$meth_call\n";sleep(1);
1735
1736 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1737
1738 my @len = split (/\D+/,$cigar); # storing the length per operation
1739 my @ops = split (/\d+/,$cigar); # storing the operation
1740 shift @ops; # remove the empty first element
1741 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1742
1743 my @comp_cigar; # building an array with all CIGAR operations
1744 foreach my $index (0..$#len) {
1745 foreach (1..$len[$index]) {
1746 # print "$ops[$index]";
1747 push @comp_cigar, $ops[$index];
1748 }
1749 }
1750
1751 # print "original CIGAR: $cigar\n";
1752 # print join ("",@comp_cigar),"\n";
1753
1754 ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly
1755 if ($strand eq '+') {
1756
1757 ### clipping the 3' end does not affect the starting position of forward strand alignments
1758 # ignore 5' has already been taken care of at this stage, if relevant at all
1759
1760 for (1..$ignore_3prime) {
1761 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end
1762 # print join ("",@comp_cigar),"\n";
1763 # print "$_ deleted $op from 3' end\n";
1764 while ($op eq 'D') { # repeating this for deletions (D)
1765 $op = pop @comp_cigar;
1766 # print join ("",@comp_cigar),"\n";
1767 # print "$_ deleted $op from 3' end\n";
1768 }
1769 }
1770 # print "Final truncated CIGAR string:\n";
1771 # print join ("",@comp_cigar),"\n";
1772 # $start += $ignore + $D_count - $I_count;
1773 # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n";
1774 }
1775 elsif ($strand eq '-') {
1776
1777 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1778 my $I_count = 0;
1779
1780 for (1..$ignore_3prime) {
1781 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
1782 while ($op eq 'D') { # repeating this for deletions (D)
1783 $D_count++;
1784 $op = shift @comp_cigar;
1785 }
1786 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1787 $I_count++;
1788 }
1789
1790 }
1791
1792 # here we need to discriminate if the start has been adjusted because of --ignore or not
1793 if ($ignore){
1794 # the start position has already been modified for --ignore already, so we don't have to adjust the position again now
1795 }
1796 else{
1797 # Here we need to add the length ignore_3prime to the read starting position
1798 # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step
1799 $start += $ignore_3prime + $D_count - $I_count;
1800 }
1801
1802 }
1803
1804 ### reconstituting shortened CIGAR string
1805 my $new_cigar;
1806 my $count = 0;
1807 my $last_op;
1808 # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n";
1809 foreach my $op (@comp_cigar) {
1810 unless (defined $last_op){
1811 $last_op = $op;
1812 ++$count;
1813 next;
1814 }
1815 if ($last_op eq $op) {
1816 ++$count;
1817 } else {
1818 $new_cigar .= "$count$last_op";
1819 $last_op = $op;
1820 $count = 1;
1821 }
1822 }
1823 $new_cigar .= "$count$last_op"; # appending the last operation and count
1824 $cigar = $new_cigar;
1825 # print "ignore_3prime adjusted scalar: $cigar\n";
1826 }
1827 }
1828 ### printing out the methylation state of every C in the read
1829 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
1830
1831 ++$methylation_call_strings_processed; # 1 per single-end result
1832 }
1833 }
1834 }
1835
1836 ### PROCESSING PAIRED-END RESULT FILES
1837 elsif ($paired) {
1838
1839 ### proceeding differently now for SAM format or vanilla Bismark format files
1840 if ($vanilla) { # old vanilla Bismark paired-end output format
1841 while (<IN>) {
1842 ++$line_count;
1843 warn "processed line: $line_count\n" if ($line_count%500000 == 0);
1844
1845 if ( ($line_count - $offset)%$multicore == 0){
1846 # warn "line count: $line_count\noffset: $offset\n";
1847 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1848 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1849 }
1850 else{
1851 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1852 next;
1853 }
1854
1855 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1856 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
1857
1858 my $index;
1859 chomp $genome_conversion;
1860
1861 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1862 $index = 0; ## this is OT
1863 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1864 $index = 2; ## this is CTOB!!!
1865 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1866 $index = 1; ## this is CTOT!!!
1867 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1868 $index = 3; ## this is OB
1869 } else {
1870 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1871 }
1872
1873 if ($meth_call_1 and $meth_call_2) {
1874 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1875
1876 ### IGNORE FROM 5' END
1877 if ($ignore) {
1878 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1879
1880 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1881 $start_read_1 += $ignore;
1882 }
1883 if ($ignore_r2) {
1884 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
1885
1886 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
1887 $end_read_2 -= $ignore_r2;
1888 }
1889
1890 ### IGNORE 3' END
1891
1892 ### Clipping off the last <int> number of bases from the methylation call string of Read 1 as specified with --ignore_3prime <int>
1893 if ($ignore_3prime) {
1894 $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime);
1895 # we don't have to adjust the position now since the shortened methylation call will be fine, see below
1896 }
1897 ### Clipping off the last <int> number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 <int>
1898 if ($ignore_3prime_r2) {
1899 $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2);
1900 # we don't have to adjust the position now since the shortened methylation call will be fine, see below
1901 }
1902
1903 my $end_read_1;
1904 my $start_read_2;
1905
1906 if ($strand eq '+') {
1907
1908 $end_read_1 = $start_read_1 + length($meth_call_1) - 1;
1909 $start_read_2 = $end_read_2 - length($meth_call_2) + 1;
1910
1911 ## we first pass the first read which is in + orientation on the forward strand
1912 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
1913
1914 # we next pass the second read which is in - orientation on the reverse strand
1915 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1916 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
1917 }
1918 else {
1919
1920 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
1921 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
1922
1923 ## we first pass the first read which is in - orientation on the reverse strand
1924 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
1925
1926 # we next pass the second read which is in + orientation on the forward strand
1927 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
1928 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
1929 }
1930
1931 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
1932 }
1933 }
1934 }
1935 else { # Bismark paired-end BAM/SAM output format (default)
1936 while (<IN>) {
1937 chomp;
1938 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1939 if (/^\@/) { # skipping header lines (starting with @)
1940 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess
1941 next;
1942 }
1943
1944 ++$line_count;
1945 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1946
1947 if ( ($line_count - $offset)%$multicore == 0){
1948 # warn "line count: $line_count\noffset: $offset\n";
1949 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
1950 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
1951 }
1952 else{
1953 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
1954 $_ = <IN>; # reading and skipping another line since this is the paired-end read
1955 next;
1956 }
1957
1958 # example paired-end reads in SAM format (2 consecutive lines)
1959 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1960 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
1961
1962 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1963 my $meth_call_1;
1964 my $first_read_conversion;
1965 my $genome_conversion;
1966
1967 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1968 my $tag = $1;
1969 my $value = $2;
1970
1971 if ($tag eq "XM") {
1972 $meth_call_1 = $value;
1973 $meth_call_1 =~ s/\r//;
1974 } elsif ($tag eq "XR") {
1975 $first_read_conversion = $value;
1976 $first_read_conversion =~ s/\r//;
1977 } elsif ($tag eq "XG") {
1978 $genome_conversion = $value;
1979 $genome_conversion =~ s/\r//;
1980 }
1981 }
1982
1983 $_ = <IN>; # reading in the paired read
1984 chomp;
1985
1986 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1987
1988 my $meth_call_2;
1989 my $second_read_conversion;
1990
1991 while ( /(XM|XR):Z:([^\t]+)/g ) {
1992 my $tag = $1;
1993 my $value = $2;
1994
1995 if ($tag eq "XM") {
1996 $meth_call_2 = $value;
1997 $meth_call_2 =~ s/\r//;
1998 } elsif ($tag eq "XR") {
1999 $second_read_conversion = $value;
2000 $second_read_conversion = s/\r//;
2001 }
2002 }
2003
2004 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
2005 # chomp $genome_conversion; # in case it captured a new line character # already removed
2006
2007 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
2008
2009 my $index;
2010 my $strand;
2011
2012 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
2013 $index = 0; ## this is OT
2014 $strand = '+';
2015 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
2016 $index = 1; ## this is CTOT
2017 $strand = '-';
2018 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
2019 $index = 2; ## this is CTOB
2020 $strand = '+';
2021 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
2022 $index = 3; ## this is OB
2023 $strand = '-';
2024 } else {
2025 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
2026 }
2027
2028 ### reversing the methylation call of the read that was reverse-complemented
2029 if ($strand eq '+') {
2030 $meth_call_2 = reverse $meth_call_2;
2031 } else {
2032 $meth_call_1 = reverse $meth_call_1;
2033 }
2034 # warn "\n$meth_call_1\n$meth_call_2\n";
2035
2036 if ($meth_call_1 and $meth_call_2) {
2037
2038 my $end_read_1;
2039
2040 ### READ 1
2041 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
2042 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
2043 shift @ops_1; # remove the empty first element
2044
2045 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
2046
2047 my @comp_cigar_1; # building an array with all CIGAR operations
2048 foreach my $index (0..$#len_1) {
2049 foreach (1..$len_1[$index]) {
2050 # print "$ops_1[$index]";
2051 push @comp_cigar_1, $ops_1[$index];
2052 }
2053 }
2054 # print "original CIGAR read 1: $cigar_1\n";
2055 # print "original CIGAR read 1: @comp_cigar_1\n";
2056
2057 ### READ 2
2058 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
2059 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
2060 shift @ops_2; # remove the empty first element
2061 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
2062 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
2063 foreach my $index (0..$#len_2) {
2064 foreach (1..$len_2[$index]) {
2065 # print "$ops_2[$index]";
2066 push @comp_cigar_2, $ops_2[$index];
2067 }
2068 }
2069 # print "original CIGAR read 2: $cigar_2\n";
2070 # print "original CIGAR read 2: @comp_cigar_2\n";
2071
2072 ##################################
2073 ### IGNORE BASES FROM 5' END ###
2074 ##################################
2075
2076 if ($ignore) {
2077 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore <int>' for read 1
2078 ### the methylation calls have already been reversed where necessary
2079
2080 if ( (length($meth_call_1) - $ignore) <= 0){
2081 # next; # skipping this read entirely if the read is shorter than the portion to be ignored
2082 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2083 }
2084 else {
2085 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
2086
2087 if ($strand eq '+') {
2088
2089 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
2090 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
2091 my $I_count_1 = 0;
2092
2093 for (1..$ignore) {
2094 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
2095 # print "$_ deleted $op\n";
2096
2097 while ($op eq 'D') { # repeating this for deletions (D)
2098 $D_count_1++;
2099 $op = shift @comp_cigar_1;
2100 # print "$_ deleted $op\n";
2101 }
2102 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2103 $I_count_1++;
2104 }
2105 }
2106
2107 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
2108 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
2109
2110 # the start position of reads mapping to the reverse strand is being adjusted further below
2111 }
2112 elsif ($strand eq '-') {
2113
2114 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
2115 for (1..$ignore) {
2116 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2117 while ($op eq 'D') { # repeating this for deletions (D)
2118 $op = pop @comp_cigar_1;
2119 }
2120 }
2121 # the start position of reads mapping to the reverse strand is being adjusted further below
2122
2123 }
2124 }
2125 }
2126
2127 if ($ignore_r2) {
2128 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
2129 ### the methylation calls have already been reversed where necessary
2130
2131 if ( (length($meth_call_2) - $ignore_r2) <= 0){
2132 # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair!
2133 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2134 }
2135 else {
2136 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
2137
2138 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
2139
2140 if ($strand eq '+') {
2141
2142 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
2143
2144 for (1..$ignore_r2) {
2145 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2146 while ($op eq 'D') { # repeating this for deletions (D)
2147 $op = pop @comp_cigar_2;
2148 }
2149 }
2150 # the start position of reads mapping to the reverse strand is being adjusted further below
2151 }
2152 elsif ($strand eq '-') {
2153
2154 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
2155 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
2156 my $I_count_2 = 0;
2157
2158 for (1..$ignore_r2) {
2159 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
2160 # print "$_ deleted $op\n";
2161
2162 while ($op eq 'D') { # repeating this for deletions (D)
2163 $D_count_2++;
2164 $op = shift @comp_cigar_2;
2165 # print "$_ deleted $op\n";
2166 }
2167 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2168 $I_count_2++;
2169 }
2170 }
2171
2172 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
2173 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
2174 }
2175 }
2176 }
2177
2178 if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string
2179
2180 ### reconstituting shortened CIGAR string 1
2181 my $new_cigar_1;
2182 my $count_1 = 0;
2183 my $last_op_1;
2184 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
2185 foreach my $op (@comp_cigar_1) {
2186 unless (defined $last_op_1){
2187 $last_op_1 = $op;
2188 ++$count_1;
2189 next;
2190 }
2191 if ($last_op_1 eq $op) {
2192 ++$count_1;
2193 } else {
2194 $new_cigar_1 .= "$count_1$last_op_1";
2195 $last_op_1 = $op;
2196 $count_1 = 1;
2197 }
2198 }
2199 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
2200 $cigar_1 = $new_cigar_1;
2201 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
2202 }
2203
2204 if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2205
2206 ### reconstituting shortened CIGAR string 2
2207 my $new_cigar_2;
2208 my $count_2 = 0;
2209 my $last_op_2;
2210 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
2211 foreach my $op (@comp_cigar_2) {
2212 unless (defined $last_op_2){
2213 $last_op_2 = $op;
2214 ++$count_2;
2215 next;
2216 }
2217 if ($last_op_2 eq $op) {
2218 ++$count_2;
2219 }
2220 else {
2221 $new_cigar_2 .= "$count_2$last_op_2";
2222 $last_op_2 = $op;
2223 $count_2 = 1;
2224 }
2225 }
2226 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
2227 $cigar_2 = $new_cigar_2;
2228 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
2229 }
2230
2231 ###########################
2232 ### END IGNORE 5' END ###
2233 ###########################
2234
2235 ##################################
2236 ### IGNORE BASES FROM 3' END ###
2237 ##################################
2238
2239 # print "CIGAR string before truncating 3' end (Read 1)\n";
2240 # print join ("",@comp_cigar_1),"\n";
2241
2242 if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further
2243
2244 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime <int>' for read 1
2245 ### the methylation calls have already been reversed where necessary
2246
2247 if ( (length($meth_call_1) - $ignore_3prime) <= 0){
2248 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2249 }
2250 else {
2251 $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime);
2252 # warn "truncated meth_call 1:\n$meth_call_1\n";
2253
2254 if ($strand eq '+') {
2255
2256 ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments
2257 # ignore 5' has already been taken care of at this stage, if relevant at all
2258
2259 for (1..$ignore_3prime) {
2260 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end
2261 # print "$_ deleted $op from 3' end\n";
2262 # print join ("",@comp_cigar_1),"\n";
2263
2264 while ($op eq 'D') { # repeating this for deletions (D)
2265 $op = pop @comp_cigar_1;
2266 # print join ("",@comp_cigar_1),"\n";
2267 # print "$_ deleted $op from 3' end\n";
2268 }
2269 }
2270
2271 # print "Final truncated CIGAR string (Read 1):\n";
2272 # print join ("",@comp_cigar_1),"\n";
2273
2274 }
2275 elsif ($strand eq '-') {
2276
2277 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
2278 my $I_count_1 = 0;
2279
2280 ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start
2281 for (1..$ignore_3prime) {
2282 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array
2283 # print join ("",@comp_cigar_1),"\n";
2284
2285 while ($op eq 'D') { # repeating this for deletions (D)
2286 $D_count_1++;
2287 $op = shift @comp_cigar_1;
2288 # print join ("",@comp_cigar_1),"\n";
2289 }
2290
2291 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2292 $I_count_1++;
2293 }
2294 }
2295
2296 # print "Final truncated CIGAR string reverse_read:\n";
2297 # print join ("",@comp_cigar_1),"\n";
2298
2299 # Here we need to add the length ignore_3prime to the read starting position
2300 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
2301 $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1;
2302
2303 }
2304 }
2305 }
2306
2307 if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further
2308
2309 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime_r2 <int>' for read 2
2310 ### the methylation calls have already been reversed where necessary
2311
2312 if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){
2313 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored
2314 }
2315 else {
2316 $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2);
2317 # warn "truncated meth_call 2:\n$meth_call_2\n";
2318
2319 ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly
2320
2321 if ($strand eq '+') {
2322
2323 ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment)
2324 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start
2325
2326 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
2327 my $I_count_2 = 0;
2328
2329 for (1..$ignore_3prime_r2) {
2330 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array
2331 # print "$_ deleted $op from 3' end\n";
2332 # print join ("",@comp_cigar_2),"\n";
2333
2334 while ($op eq 'D') { # repeating this for deletions (D)
2335 $D_count_2++;
2336 $op = shift @comp_cigar_2;
2337 # print "$_ deleted $op from 3' end\n";
2338 # print join ("",@comp_cigar_2),"\n";
2339 }
2340
2341 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2342 $I_count_2++;
2343 }
2344 }
2345
2346 # print "Final truncated CIGAR string 2 (+ alignment):\n";
2347 # print join ("",@comp_cigar_2),"\n";
2348
2349 # Here we need to add the length ignore_3prime_r2 to the read starting position
2350 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step
2351 $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2;
2352
2353 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
2354 }
2355 elsif ($strand eq '-') {
2356 ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment)
2357 ### ignore_r2 5' has already been taken care of at this stage, if relevant at all
2358
2359 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end
2360
2361 for (1..$ignore_3prime_r2) {
2362 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
2363 # print "$_ deleted $op\n";
2364 # print join ("",@comp_cigar_2),"\n";
2365
2366 while ($op eq 'D') { # repeating this for deletions (D)
2367 $op = pop @comp_cigar_2;
2368 # print "$_ deleted $op\n";
2369 # print join ("",@comp_cigar_2),"\n";
2370 }
2371 }
2372
2373 # print "Final truncated CIGAR string 2 (- alignment):\n";
2374 # print join ("",@comp_cigar_2),"\n";
2375
2376 }
2377 }
2378 }
2379
2380 if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2381
2382 ### reconstituting shortened CIGAR string 1
2383 my $new_cigar_1;
2384 my $count_1 = 0;
2385 my $last_op_1;
2386 # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n";
2387 foreach my $op (@comp_cigar_1) {
2388 unless (defined $last_op_1){
2389 $last_op_1 = $op;
2390 ++$count_1;
2391 next;
2392 }
2393 if ($last_op_1 eq $op) {
2394 ++$count_1;
2395 }
2396 else {
2397 $new_cigar_1 .= "$count_1$last_op_1";
2398 $last_op_1 = $op;
2399 $count_1 = 1;
2400 }
2401 }
2402 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
2403 $cigar_1 = $new_cigar_1;
2404 # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n";
2405 }
2406
2407 if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string
2408
2409 ### reconstituting shortened CIGAR string 2
2410 my $new_cigar_2;
2411 my $count_2 = 0;
2412 my $last_op_2;
2413 # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n";
2414 foreach my $op (@comp_cigar_2) {
2415 unless (defined $last_op_2){
2416 $last_op_2 = $op;
2417 ++$count_2;
2418 next;
2419 }
2420 if ($last_op_2 eq $op) {
2421 ++$count_2;
2422 }
2423 else {
2424 $new_cigar_2 .= "$count_2$last_op_2";
2425 $last_op_2 = $op;
2426 $count_2 = 1;
2427 }
2428 }
2429 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
2430 $cigar_2 = $new_cigar_2;
2431 # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
2432 }
2433
2434 ###########################
2435 ### END IGNORE 3' END ###
2436 ###########################
2437
2438
2439 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
2440
2441 if ($strand eq '+') {
2442 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
2443 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2444 # print "reverse: @comp_cigar_2\n";
2445
2446 my $MD_count_1 = 0;
2447 foreach (@comp_cigar_1) {
2448 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2449 }
2450
2451 my $MD_count_2 = 0;
2452 foreach (@comp_cigar_2) {
2453 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2454 }
2455
2456 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
2457 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
2458 }
2459 else {
2460 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
2461
2462 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2463 # print "reverse: @comp_cigar_1\n";
2464
2465 my $MD_count_1 = 0;
2466 foreach (@comp_cigar_1) {
2467 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2468 }
2469
2470 $end_read_1 = $start_read_1;
2471 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
2472 }
2473
2474 if ($strand eq '+') {
2475 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
2476 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
2477
2478 # we next pass the second read which is in - orientation on the reverse strand
2479 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
2480 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
2481 }
2482 else {
2483 ## we first pass the first read which is in - orientation on the reverse strand
2484 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
2485
2486 # we next pass the second read which is in + orientation on the forward strand
2487 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
2488 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
2489 }
2490
2491 $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings
2492 }
2493 }
2494 }
2495 } else {
2496 die "Single-end or paired-end reads not specified properly\n";
2497 }
2498
2499 $counting{sequences_count} = $line_count;
2500 $counting{methylation_call_strings} = $methylation_call_strings_processed;
2501
2502 if ($multicore == 1){
2503 print_splitting_report ();
2504 }
2505 elsif ($multicore > 1){
2506 print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
2507 print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed);
2508 }
2509
2510 return ($process_id,\@pids,$report_filename);
2511
2512 }
2513
2514
2515
2516 sub print_splitting_report{
2517
2518 ### Calculating methylation percentages if applicable
2519 warn "\nProcessed $counting{sequences_count} lines in total\n";
2520 warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
2521 if ($report) {
2522 print REPORT "\nProcessed $counting{sequences_count} lines in total\n";
2523 print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n";
2524 }
2525
2526 my $percent_meCpG;
2527 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2528 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2529 }
2530
2531 my $percent_meCHG;
2532 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2533 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2534 }
2535
2536 my $percent_meCHH;
2537 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2538 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2539 }
2540
2541 my $percent_non_CpG_methylation;
2542 if ($merge_non_CpG){
2543 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2544 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
2545 }
2546 }
2547
2548 if ($report){
2549 ### detailed information about Cs analysed
2550 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2551
2552 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2553 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2554
2555 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2556 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2557 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2558
2559 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2560 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2561 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2562
2563 ### calculating methylated CpG percentage if applicable
2564 if ($percent_meCpG){
2565 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2566 }
2567 else{
2568 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2569 }
2570
2571 ### 2-Context Output
2572 if ($merge_non_CpG){
2573 if ($percent_non_CpG_methylation){
2574 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2575 }
2576 else{
2577 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2578 }
2579 }
2580
2581 ### 3 Context Output
2582 else{
2583 ### calculating methylated CHG percentage if applicable
2584 if ($percent_meCHG){
2585 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2586 }
2587 else{
2588 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2589 }
2590
2591 ### calculating methylated CHH percentage if applicable
2592 if ($percent_meCHH){
2593 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2594 }
2595 else{
2596 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2597 }
2598 }
2599 }
2600
2601 ### detailed information about Cs analysed for on-screen report
2602 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
2603
2604 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2605 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
2606
2607 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2608 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2609 warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2610
2611 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2612 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2613 warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2614
2615 ### printing methylated CpG percentage if applicable
2616 if ($percent_meCpG){
2617 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
2618 }
2619 else{
2620 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2621 }
2622
2623 ### 2-Context Output
2624 if ($merge_non_CpG){
2625 if ($percent_non_CpG_methylation){
2626 warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2627 }
2628 else{
2629 warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2630 }
2631 }
2632
2633 ### 3-Context Output
2634 else{
2635 ### printing methylated CHG percentage if applicable
2636 if ($percent_meCHG){
2637 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
2638 }
2639 else{
2640 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2641 }
2642
2643 ### printing methylated CHH percentage if applicable
2644 if ($percent_meCHH){
2645 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2646 }
2647 else{
2648 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2649 }
2650 }
2651 }
2652
2653 ###
2654
2655 sub print_splitting_report_multicore{
2656
2657 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
2658
2659 # warn "\$report_filename is $report_filename\n";
2660 my $special_report = $report_filename.".$offset";
2661
2662 open (SPECIAL_REPORT,'>',$special_report) or die $!;
2663 # warn "line count\t$line_count\n";
2664 # warn "meth call strings\t$meth_call_strings\n";
2665
2666 print SPECIAL_REPORT "line count\t$line_count\n";
2667 print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n";
2668
2669 ### Calculating methylation percentages if applicable
2670 my $percent_meCpG;
2671 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2672 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2673 }
2674
2675 my $percent_meCHG;
2676 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2677 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2678 }
2679
2680 my $percent_meCHH;
2681 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2682 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2683 }
2684
2685 my $percent_non_CpG_methylation;
2686 if ($merge_non_CpG){
2687 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2688 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
2689 }
2690 }
2691
2692 if ($report){
2693
2694 ### detailed information about Cs analysed
2695 print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2696
2697 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2698 print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2699
2700 print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2701 print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2702 print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2703
2704 print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2705 print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2706 print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2707
2708 ### calculating methylated CpG percentage if applicable
2709 if ($percent_meCpG){
2710 print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2711 }
2712 else{
2713 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2714 }
2715
2716 ### 2-Context Output
2717 if ($merge_non_CpG){
2718 if ($percent_non_CpG_methylation){
2719 print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2720 }
2721 else{
2722 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2723 }
2724 }
2725
2726 ### 3 Context Output
2727 else{
2728 ### calculating methylated CHG percentage if applicable
2729 if ($percent_meCHG){
2730 print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2731 }
2732 else{
2733 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2734 }
2735
2736 ### calculating methylated CHH percentage if applicable
2737 if ($percent_meCHH){
2738 print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2739 }
2740 else{
2741 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2742 }
2743 }
2744 }
2745 close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n";
2746 }
2747
2748
2749 ###
2750
2751 ### INDIVIDUAL M-BIAS REPORTS
2752
2753 sub print_mbias_report_multicore{
2754
2755 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_;
2756
2757 # warn "\$report_filename is $report_filename\n";
2758 my $special_mbias_report = $report_filename.".${offset}.mbias";
2759
2760 open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!;
2761
2762 # determining maximum read length
2763 my $max_length_1 = 0;
2764 my $max_length_2 = 0;
2765
2766 foreach my $context (keys %mbias_1){
2767 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
2768 $max_length_1 = $pos unless ($max_length_1 >= $pos);
2769 }
2770 }
2771 if ($paired){
2772 foreach my $context (keys %mbias_2){
2773 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
2774 $max_length_2 = $pos unless ($max_length_2 >= $pos);
2775 }
2776 }
2777 }
2778
2779 if ($single){
2780 # warn "Determining maximum read length for M-Bias plot\n";
2781 # warn "Maximum read length of Read 1: $max_length_1\n\n";
2782 }
2783 else{
2784 # warn "Determining maximum read lengths for M-Bias plots\n";
2785 # warn "Maximum read length of Read 1: $max_length_1\n";
2786 # warn "Maximum read length of Read 2: $max_length_2\n\n";
2787 }
2788
2789 foreach my $context (qw(CpG CHG CHH)){
2790
2791 if ($paired){
2792 print SPECIAL_MBIAS "$context context (R1)\n================\n";
2793 }
2794 else{
2795 print SPECIAL_MBIAS "$context context\n===========\n";
2796 }
2797 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
2798
2799 foreach my $pos (1..$max_length_1){
2800
2801 unless (defined $mbias_1{$context}->{$pos}->{meth}){
2802 $mbias_1{$context}->{$pos}->{meth} = 0;
2803 }
2804 unless (defined $mbias_1{$context}->{$pos}->{un}){
2805 $mbias_1{$context}->{$pos}->{un} = 0;
2806 }
2807
2808 my $percent = '';
2809 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
2810 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
2811 }
2812 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
2813
2814 print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
2815 }
2816 print SPECIAL_MBIAS "\n";
2817 }
2818
2819 if ($paired){
2820
2821 foreach my $context (qw(CpG CHG CHH)){
2822
2823 print SPECIAL_MBIAS "$context context (R2)\n================\n";
2824 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
2825
2826 foreach my $pos (1..$max_length_2){
2827
2828 unless (defined $mbias_2{$context}->{$pos}->{meth}){
2829 $mbias_2{$context}->{$pos}->{meth} = 0;
2830 }
2831 unless (defined $mbias_2{$context}->{$pos}->{un}){
2832 $mbias_2{$context}->{$pos}->{un} = 0;
2833 }
2834
2835 my $percent = '';
2836 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
2837 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
2838 }
2839 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
2840
2841 print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
2842 }
2843 }
2844 }
2845
2846 close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n";
2847 }
2848
2849
2850 ###
2851
2852
2853 sub print_individual_C_methylation_states_paired_end_files{
2854
2855 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
2856
2857 unless (defined $meth_call) {
2858 return; # skip this read
2859 }
2860
2861 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
2862 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
2863
2864 my @methylation_calls = split(//,$meth_call);
2865
2866 #################################################################
2867 ### . for bases not involving cytosines ###
2868 ### X for methylated C in CHG context (was protected) ###
2869 ### x for not methylated C in CHG context (was converted) ###
2870 ### H for methylated C in CHH context (was protected) ###
2871 ### h for not methylated C in CHH context (was converted) ###
2872 ### Z for methylated C in CpG context (was protected) ###
2873 ### z for not methylated C in CpG context (was converted) ###
2874 ### U for methylated C in Unknown context (was protected) ###
2875 ### u for not methylated C in Unknown context (was converted) ###
2876 #################################################################
2877
2878 my $methyl_CHG_count = 0;
2879 my $methyl_CHH_count = 0;
2880 my $methyl_CpG_count = 0;
2881 my $unmethylated_CHG_count = 0;
2882 my $unmethylated_CHH_count = 0;
2883 my $unmethylated_CpG_count = 0;
2884
2885 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2886 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2887 my @comp_cigar;
2888
2889 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
2890 if ($cigar =~ /^\d+M$/){
2891 # this check speeds up the extraction process by up to 60%!!!
2892 }
2893 else{ # parsing CIGAR string
2894 my @len;
2895 my @ops;
2896 @len = split (/\D+/,$cigar); # storing the length per operation
2897 @ops = split (/\d+/,$cigar); # storing the operation
2898 shift @ops; # remove the empty first element
2899
2900 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2901
2902 foreach my $index (0..$#len){
2903 foreach (1..$len[$index]){
2904 # print "$ops[$index]";
2905 push @comp_cigar, $ops[$index];
2906 }
2907 }
2908 # warn "\nDetected CIGAR string: $cigar\n";
2909 # warn "Length of methylation call: ",length $meth_call,"\n";
2910 # warn "number of operations: ",scalar @ops,"\n";
2911 # warn "number of length digits: ",scalar @len,"\n\n";
2912 # print @comp_cigar,"\n";
2913 # print "$meth_call\n\n";
2914 # sleep (1);
2915 }
2916
2917 if ($strand eq '-') {
2918
2919 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
2920 if (@comp_cigar){
2921 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2922 }
2923 # print "reverse CIGAR string: @comp_cigar\n";
2924
2925 ### the start position of paired-end files has already been corrected, see above
2926 }
2927
2928 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2929
2930 if ($merge_non_CpG) {
2931 if ($no_overlap) { # this has to be read 2...
2932
2933 ### single-file CpG and non-CpG context output
2934 if ($full) {
2935 if ($strand eq '+') {
2936 for my $index (0..$#methylation_calls) {
2937
2938 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2939 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2940 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2941 $cigar_offset += $cigar_mod;
2942 $pos_offset += $pos_mod;
2943 }
2944
2945 ### Returning as soon as the methylation calls start overlapping
2946 if ($start+$index+$pos_offset >= $end_read_1) {
2947 return;
2948 }
2949
2950 if ($methylation_calls[$index] eq 'X') {
2951 $counting{total_meCHG_count}++;
2952 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2953 if ($read_identity == 1){
2954 $mbias_1{CHG}->{$index+1}->{meth}++;
2955 }
2956 else{
2957 $mbias_2{CHG}->{$index+1}->{meth}++;
2958 }
2959 }
2960 elsif ($methylation_calls[$index] eq 'x') {
2961 $counting{total_unmethylated_CHG_count}++;
2962 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2963 if ($read_identity == 1){
2964 $mbias_1{CHG}->{$index+1}->{un}++;
2965 }
2966 else{
2967 $mbias_2{CHG}->{$index+1}->{un}++;
2968 }
2969 }
2970 elsif ($methylation_calls[$index] eq 'Z') {
2971 $counting{total_meCpG_count}++;
2972 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2973 if ($read_identity == 1){
2974 $mbias_1{CpG}->{$index+1}->{meth}++;
2975 }
2976 else{
2977 $mbias_2{CpG}->{$index+1}->{meth}++;
2978 }
2979 }
2980 elsif ($methylation_calls[$index] eq 'z') {
2981 $counting{total_unmethylated_CpG_count}++;
2982 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2983 if ($read_identity == 1){
2984 $mbias_1{CpG}->{$index+1}->{un}++;
2985 }
2986 else{
2987 $mbias_2{CpG}->{$index+1}->{un}++;
2988 }
2989 }
2990 elsif ($methylation_calls[$index] eq 'H') {
2991 $counting{total_meCHH_count}++;
2992 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
2993 if ($read_identity == 1){
2994 $mbias_1{CHH}->{$index+1}->{meth}++;
2995 }
2996 else{
2997 $mbias_2{CHH}->{$index+1}->{meth}++;
2998 }
2999 }
3000 elsif ($methylation_calls[$index] eq 'h') {
3001 $counting{total_unmethylated_CHH_count}++;
3002 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3003 if ($read_identity == 1){
3004 $mbias_1{CHH}->{$index+1}->{un}++;
3005 }
3006 else{
3007 $mbias_2{CHH}->{$index+1}->{un}++;
3008 }
3009 }
3010 elsif ($methylation_calls[$index] eq '.'){}
3011 elsif (lc$methylation_calls[$index] eq 'u'){}
3012 else{
3013 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3014 }
3015 }
3016 }
3017 elsif ($strand eq '-') {
3018 for my $index (0..$#methylation_calls) {
3019
3020 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3021 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3022 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3023 $cigar_offset += $cigar_mod;
3024 $pos_offset += $pos_mod;
3025 }
3026
3027 ### Returning as soon as the methylation calls start overlapping
3028 if ($start-$index+$pos_offset <= $end_read_1) {
3029 return;
3030 }
3031
3032 if ($methylation_calls[$index] eq 'X') {
3033 $counting{total_meCHG_count}++;
3034 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3035 if ($read_identity == 1){
3036 $mbias_1{CHG}->{$index+1}->{meth}++;
3037 }
3038 else{
3039 $mbias_2{CHG}->{$index+1}->{meth}++;
3040 }
3041 }
3042 elsif ($methylation_calls[$index] eq 'x') {
3043 $counting{total_unmethylated_CHG_count}++;
3044 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3045 if ($read_identity == 1){
3046 $mbias_1{CHG}->{$index+1}->{un}++;
3047 }
3048 else{
3049 $mbias_2{CHG}->{$index+1}->{un}++;
3050 }
3051 }
3052 elsif ($methylation_calls[$index] eq 'Z') {
3053 $counting{total_meCpG_count}++;
3054 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3055 if ($read_identity == 1){
3056 $mbias_1{CpG}->{$index+1}->{meth}++;
3057 }
3058 else{
3059 $mbias_2{CpG}->{$index+1}->{meth}++;
3060 }
3061 }
3062 elsif ($methylation_calls[$index] eq 'z') {
3063 $counting{total_unmethylated_CpG_count}++;
3064 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3065 if ($read_identity == 1){
3066 $mbias_1{CpG}->{$index+1}->{un}++;
3067 }
3068 else{
3069 $mbias_2{CpG}->{$index+1}->{un}++;
3070 }
3071 }
3072 elsif ($methylation_calls[$index] eq 'H') {
3073 $counting{total_meCHH_count}++;
3074 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3075 if ($read_identity == 1){
3076 $mbias_1{CHH}->{$index+1}->{meth}++;
3077 }
3078 else{
3079 $mbias_2{CHH}->{$index+1}->{meth}++;
3080 }
3081 }
3082 elsif ($methylation_calls[$index] eq 'h') {
3083 $counting{total_unmethylated_CHH_count}++;
3084 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3085 if ($read_identity == 1){
3086 $mbias_1{CHH}->{$index+1}->{un}++;
3087 }
3088 else{
3089 $mbias_2{CHH}->{$index+1}->{un}++;
3090 }
3091 }
3092 elsif ($methylation_calls[$index] eq '.') {}
3093 elsif (lc$methylation_calls[$index] eq 'u'){}
3094 else{
3095 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3096 }
3097 }
3098 } else {
3099 die "The read orientation was neither + nor -: '$strand'\n";
3100 }
3101 }
3102
3103 ### strand-specific methylation output
3104 else {
3105 if ($strand eq '+') {
3106 for my $index (0..$#methylation_calls) {
3107
3108 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3109 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3110 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3111 $cigar_offset += $cigar_mod;
3112 $pos_offset += $pos_mod;
3113 }
3114
3115 ### Returning as soon as the methylation calls start overlapping
3116 if ($start+$index+$pos_offset >= $end_read_1) {
3117 return;
3118 }
3119
3120 if ($methylation_calls[$index] eq 'X') {
3121 $counting{total_meCHG_count}++;
3122 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3123 if ($read_identity == 1){
3124 $mbias_1{CHG}->{$index+1}->{meth}++;
3125 }
3126 else{
3127 $mbias_2{CHG}->{$index+1}->{meth}++;
3128 }
3129 }
3130 elsif ($methylation_calls[$index] eq 'x') {
3131 $counting{total_unmethylated_CHG_count}++;
3132 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3133 if ($read_identity == 1){
3134 $mbias_1{CHG}->{$index+1}->{un}++;
3135 }
3136 else{
3137 $mbias_2{CHG}->{$index+1}->{un}++;
3138 }
3139 }
3140 elsif ($methylation_calls[$index] eq 'Z') {
3141 $counting{total_meCpG_count}++;
3142 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3143 if ($read_identity == 1){
3144 $mbias_1{CpG}->{$index+1}->{meth}++;
3145 }
3146 else{
3147 $mbias_2{CpG}->{$index+1}->{meth}++;
3148 }
3149 }
3150 elsif ($methylation_calls[$index] eq 'z') {
3151 $counting{total_unmethylated_CpG_count}++;
3152 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3153 if ($read_identity == 1){
3154 $mbias_1{CpG}->{$index+1}->{un}++;
3155 }
3156 else{
3157 $mbias_2{CpG}->{$index+1}->{un}++;
3158 }
3159 }
3160 elsif ($methylation_calls[$index] eq 'H') {
3161 $counting{total_meCHH_count}++;
3162 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3163 if ($read_identity == 1){
3164 $mbias_1{CHH}->{$index+1}->{meth}++;
3165 }
3166 else{
3167 $mbias_2{CHH}->{$index+1}->{meth}++;
3168 }
3169 }
3170 elsif ($methylation_calls[$index] eq 'h') {
3171 $counting{total_unmethylated_CHH_count}++;
3172 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3173 if ($read_identity == 1){
3174 $mbias_1{CHH}->{$index+1}->{un}++;
3175 }
3176 else{
3177 $mbias_2{CHH}->{$index+1}->{un}++;
3178 }
3179 }
3180 elsif ($methylation_calls[$index] eq '.') {}
3181 elsif (lc$methylation_calls[$index] eq 'u'){}
3182 else{
3183 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3184 }
3185 }
3186 } elsif ($strand eq '-') {
3187 for my $index (0..$#methylation_calls) {
3188
3189 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3190 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3191 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3192 $cigar_offset += $cigar_mod;
3193 $pos_offset += $pos_mod;
3194 }
3195
3196 ### Returning as soon as the methylation calls start overlapping
3197 if ($start-$index+$pos_offset <= $end_read_1) {
3198 return;
3199 }
3200
3201 if ($methylation_calls[$index] eq 'X') {
3202 $counting{total_meCHG_count}++;
3203 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3204 if ($read_identity == 1){
3205 $mbias_1{CHG}->{$index+1}->{meth}++;
3206 }
3207 else{
3208 $mbias_2{CHG}->{$index+1}->{meth}++;
3209 }
3210 }
3211 elsif ($methylation_calls[$index] eq 'x') {
3212 $counting{total_unmethylated_CHG_count}++;
3213 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3214 if ($read_identity == 1){
3215 $mbias_1{CHG}->{$index+1}->{un}++;
3216 }
3217 else{
3218 $mbias_2{CHG}->{$index+1}->{un}++;
3219 }
3220 }
3221 elsif ($methylation_calls[$index] eq 'Z') {
3222 $counting{total_meCpG_count}++;
3223 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3224 if ($read_identity == 1){
3225 $mbias_1{CpG}->{$index+1}->{meth}++;
3226 }
3227 else{
3228 $mbias_2{CpG}->{$index+1}->{meth}++;
3229 }
3230 }
3231 elsif ($methylation_calls[$index] eq 'z') {
3232 $counting{total_unmethylated_CpG_count}++;
3233 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3234 if ($read_identity == 1){
3235 $mbias_1{CpG}->{$index+1}->{un}++;
3236 }
3237 else{
3238 $mbias_2{CpG}->{$index+1}->{un}++;
3239 }
3240 }
3241 elsif ($methylation_calls[$index] eq 'H') {
3242 $counting{total_meCHH_count}++;
3243 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3244 if ($read_identity == 1){
3245 $mbias_1{CHH}->{$index+1}->{meth}++;
3246 }
3247 else{
3248 $mbias_2{CHH}->{$index+1}->{meth}++;
3249 }
3250 }
3251 elsif ($methylation_calls[$index] eq 'h') {
3252 $counting{total_unmethylated_CHH_count}++;
3253 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3254 if ($read_identity == 1){
3255 $mbias_1{CHH}->{$index+1}->{un}++;
3256 }
3257 else{
3258 $mbias_2{CHH}->{$index+1}->{un}++;
3259 }
3260 }
3261 elsif ($methylation_calls[$index] eq '.') {}
3262 elsif (lc$methylation_calls[$index] eq 'u'){}
3263 else{
3264 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3265 }
3266 }
3267 } else {
3268 die "The strand orientation was neither + nor -: '$strand'/n";
3269 }
3270 }
3271 }
3272
3273 ### this is the default paired-end procedure allowing overlaps and using every single C position
3274 ### Still within the 2-CONTEXT ONLY optional section
3275 else {
3276 ### single-file CpG and non-CpG context output
3277 if ($full) {
3278 if ($strand eq '+') {
3279 for my $index (0..$#methylation_calls) {
3280
3281 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3282 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3283 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3284 $cigar_offset += $cigar_mod;
3285 $pos_offset += $pos_mod;
3286 }
3287
3288 if ($methylation_calls[$index] eq 'X') {
3289 $counting{total_meCHG_count}++;
3290 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3291 if ($read_identity == 1){
3292 $mbias_1{CHG}->{$index+1}->{meth}++;
3293 }
3294 else{
3295 $mbias_2{CHG}->{$index+1}->{meth}++;
3296 }
3297 }
3298 elsif ($methylation_calls[$index] eq 'x') {
3299 $counting{total_unmethylated_CHG_count}++;
3300 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3301 if ($read_identity == 1){
3302 $mbias_1{CHG}->{$index+1}->{un}++;
3303 }
3304 else{
3305 $mbias_2{CHG}->{$index+1}->{un}++;
3306 }
3307 }
3308 elsif ($methylation_calls[$index] eq 'Z') {
3309 $counting{total_meCpG_count}++;
3310 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3311 if ($read_identity == 1){
3312 $mbias_1{CpG}->{$index+1}->{meth}++;
3313 }
3314 else{
3315 $mbias_2{CpG}->{$index+1}->{meth}++;
3316 }
3317 }
3318 elsif ($methylation_calls[$index] eq 'z') {
3319 $counting{total_unmethylated_CpG_count}++;
3320 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3321 if ($read_identity == 1){
3322 $mbias_1{CpG}->{$index+1}->{un}++;
3323 }
3324 else{
3325 $mbias_2{CpG}->{$index+1}->{un}++;
3326 }
3327 }
3328 elsif ($methylation_calls[$index] eq 'H') {
3329 $counting{total_meCHH_count}++;
3330 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3331 if ($read_identity == 1){
3332 $mbias_1{CHH}->{$index+1}->{meth}++;
3333 }
3334 else{
3335 $mbias_2{CHH}->{$index+1}->{meth}++;
3336 }
3337 }
3338 elsif ($methylation_calls[$index] eq 'h') {
3339 $counting{total_unmethylated_CHH_count}++;
3340 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3341 if ($read_identity == 1){
3342 $mbias_1{CHH}->{$index+1}->{un}++;
3343 }
3344 else{
3345 $mbias_2{CHH}->{$index+1}->{un}++;
3346 }
3347 }
3348 elsif ($methylation_calls[$index] eq '.') {}
3349 elsif (lc$methylation_calls[$index] eq 'u'){}
3350 else{
3351 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3352 }
3353 }
3354 } elsif ($strand eq '-') {
3355 for my $index (0..$#methylation_calls) {
3356
3357 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3358 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3359 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3360 $cigar_offset += $cigar_mod;
3361 $pos_offset += $pos_mod;
3362 }
3363
3364 if ($methylation_calls[$index] eq 'X') {
3365 $counting{total_meCHG_count}++;
3366 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3367 if ($read_identity == 1){
3368 $mbias_1{CHG}->{$index+1}->{meth}++;
3369 }
3370 else{
3371 $mbias_2{CHG}->{$index+1}->{meth}++;
3372 }
3373 }
3374 elsif ($methylation_calls[$index] eq 'x') {
3375 $counting{total_unmethylated_CHG_count}++;
3376 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3377 if ($read_identity == 1){
3378 $mbias_1{CHG}->{$index+1}->{un}++;
3379 }
3380 else{
3381 $mbias_2{CHG}->{$index+1}->{un}++;
3382 }
3383 }
3384 elsif ($methylation_calls[$index] eq 'Z') {
3385 $counting{total_meCpG_count}++;
3386 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3387 if ($read_identity == 1){
3388 $mbias_1{CpG}->{$index+1}->{meth}++;
3389 }
3390 else{
3391 $mbias_2{CpG}->{$index+1}->{meth}++;
3392 }
3393 }
3394 elsif ($methylation_calls[$index] eq 'z') {
3395 $counting{total_unmethylated_CpG_count}++;
3396 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3397 if ($read_identity == 1){
3398 $mbias_1{CpG}->{$index+1}->{un}++;
3399 }
3400 else{
3401 $mbias_2{CpG}->{$index+1}->{un}++;
3402 }
3403 }
3404 elsif ($methylation_calls[$index] eq 'H') {
3405 $counting{total_meCHH_count}++;
3406 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3407 if ($read_identity == 1){
3408 $mbias_1{CHH}->{$index+1}->{meth}++;
3409 }
3410 else{
3411 $mbias_2{CHH}->{$index+1}->{meth}++;
3412 }
3413 }
3414 elsif ($methylation_calls[$index] eq 'h') {
3415 $counting{total_unmethylated_CHH_count}++;
3416 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3417 if ($read_identity == 1){
3418 $mbias_1{CHH}->{$index+1}->{un}++;
3419 }
3420 else{
3421 $mbias_2{CHH}->{$index+1}->{un}++;
3422 }
3423 }
3424 elsif ($methylation_calls[$index] eq '.') {}
3425 elsif (lc$methylation_calls[$index] eq 'u'){}
3426 else{
3427 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3428 }
3429 }
3430 } else {
3431 die "The strand orientation as neither + nor -: '$strand'\n";
3432 }
3433 }
3434
3435 ### strand-specific methylation output
3436 ### still within the 2-CONTEXT optional section
3437 else {
3438 if ($strand eq '+') {
3439 for my $index (0..$#methylation_calls) {
3440
3441 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3442 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3443 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3444 $cigar_offset += $cigar_mod;
3445 $pos_offset += $pos_mod;
3446 }
3447
3448 if ($methylation_calls[$index] eq 'X') {
3449 $counting{total_meCHG_count}++;
3450 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3451 if ($read_identity == 1){
3452 $mbias_1{CHG}->{$index+1}->{meth}++;
3453 }
3454 else{
3455 $mbias_2{CHG}->{$index+1}->{meth}++;
3456 }
3457 }
3458 elsif ($methylation_calls[$index] eq 'x') {
3459 $counting{total_unmethylated_CHG_count}++;
3460 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3461 if ($read_identity == 1){
3462 $mbias_1{CHG}->{$index+1}->{un}++;
3463 }
3464 else{
3465 $mbias_2{CHG}->{$index+1}->{un}++;
3466 }
3467 }
3468 elsif ($methylation_calls[$index] eq 'Z') {
3469 $counting{total_meCpG_count}++;
3470 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3471 if ($read_identity == 1){
3472 $mbias_1{CpG}->{$index+1}->{meth}++;
3473 }
3474 else{
3475 $mbias_2{CpG}->{$index+1}->{meth}++;
3476 }
3477 }
3478 elsif ($methylation_calls[$index] eq 'z') {
3479 $counting{total_unmethylated_CpG_count}++;
3480 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3481 if ($read_identity == 1){
3482 $mbias_1{CpG}->{$index+1}->{un}++;
3483 }
3484 else{
3485 $mbias_2{CpG}->{$index+1}->{un}++;
3486 }
3487 }
3488 elsif ($methylation_calls[$index] eq 'H') {
3489 $counting{total_meCHH_count}++;
3490 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3491 if ($read_identity == 1){
3492 $mbias_1{CHH}->{$index+1}->{meth}++;
3493 }
3494 else{
3495 $mbias_2{CHH}->{$index+1}->{meth}++;
3496 }
3497 }
3498 elsif ($methylation_calls[$index] eq 'h') {
3499 $counting{total_unmethylated_CHH_count}++;
3500 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3501 if ($read_identity == 1){
3502 $mbias_1{CHH}->{$index+1}->{un}++;
3503 }
3504 else{
3505 $mbias_2{CHH}->{$index+1}->{un}++;
3506 }
3507 }
3508 elsif ($methylation_calls[$index] eq '.') {}
3509 elsif (lc$methylation_calls[$index] eq 'u'){}
3510 else{
3511 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3512 }
3513 }
3514 } elsif ($strand eq '-') {
3515 for my $index (0..$#methylation_calls) {
3516
3517 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3518 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3519 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3520 $cigar_offset += $cigar_mod;
3521 $pos_offset += $pos_mod;
3522 }
3523
3524 if ($methylation_calls[$index] eq 'X') {
3525 $counting{total_meCHG_count}++;
3526 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3527 if ($read_identity == 1){
3528 $mbias_1{CHG}->{$index+1}->{meth}++;
3529 }
3530 else{
3531 $mbias_2{CHG}->{$index+1}->{meth}++;
3532 }
3533 }
3534 elsif ($methylation_calls[$index] eq 'x') {
3535 $counting{total_unmethylated_CHG_count}++;
3536 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3537 if ($read_identity == 1){
3538 $mbias_1{CHG}->{$index+1}->{un}++;
3539 }
3540 else{
3541 $mbias_2{CHG}->{$index+1}->{un}++;
3542 }
3543 }
3544 elsif ($methylation_calls[$index] eq 'Z') {
3545 $counting{total_meCpG_count}++;
3546 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3547 if ($read_identity == 1){
3548 $mbias_1{CpG}->{$index+1}->{meth}++;
3549 }
3550 else{
3551 $mbias_2{CpG}->{$index+1}->{meth}++;
3552 }
3553 }
3554 elsif ($methylation_calls[$index] eq 'z') {
3555 $counting{total_unmethylated_CpG_count}++;
3556 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3557 if ($read_identity == 1){
3558 $mbias_1{CpG}->{$index+1}->{un}++;
3559 }
3560 else{
3561 $mbias_2{CpG}->{$index+1}->{un}++;
3562 }
3563 }
3564 elsif ($methylation_calls[$index] eq 'H') {
3565 $counting{total_meCHH_count}++;
3566 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3567 if ($read_identity == 1){
3568 $mbias_1{CHH}->{$index+1}->{meth}++;
3569 }
3570 else{
3571 $mbias_2{CHH}->{$index+1}->{meth}++;
3572 }
3573 }
3574 elsif ($methylation_calls[$index] eq 'h') {
3575 $counting{total_unmethylated_CHH_count}++;
3576 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3577 if ($read_identity == 1){
3578 $mbias_1{CHH}->{$index+1}->{un}++;
3579 }
3580 else{
3581 $mbias_2{CHH}->{$index+1}->{un}++;
3582 }
3583 }
3584 elsif ($methylation_calls[$index] eq '.') {}
3585 elsif (lc$methylation_calls[$index] eq 'u'){}
3586 else{
3587 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3588 }
3589 }
3590 } else {
3591 die "The strand orientation as neither + nor -: '$strand'\n";
3592 }
3593 }
3594 }
3595 }
3596
3597 ############################################
3598 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
3599 ############################################
3600
3601 elsif ($no_overlap) {
3602 ### single-file CpG, CHG and CHH context output
3603 if ($full) {
3604 if ($strand eq '+') {
3605 for my $index (0..$#methylation_calls) {
3606
3607 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3608 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3609 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3610 $cigar_offset += $cigar_mod;
3611 $pos_offset += $pos_mod;
3612 }
3613
3614 ### Returning as soon as the methylation calls start overlapping
3615 if ($start+$index+$pos_offset >= $end_read_1) {
3616 return;
3617 }
3618
3619 if ($methylation_calls[$index] eq 'X') {
3620 $counting{total_meCHG_count}++;
3621 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3622 if ($read_identity == 1){
3623 $mbias_1{CHG}->{$index+1}->{meth}++;
3624 }
3625 else{
3626 $mbias_2{CHG}->{$index+1}->{meth}++;
3627 }
3628 }
3629 elsif ($methylation_calls[$index] eq 'x') {
3630 $counting{total_unmethylated_CHG_count}++;
3631 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3632 if ($read_identity == 1){
3633 $mbias_1{CHG}->{$index+1}->{un}++;
3634 }
3635 else{
3636 $mbias_2{CHG}->{$index+1}->{un}++;
3637 }
3638 }
3639 elsif ($methylation_calls[$index] eq 'Z') {
3640 $counting{total_meCpG_count}++;
3641 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3642 if ($read_identity == 1){
3643 $mbias_1{CpG}->{$index+1}->{meth}++;
3644 }
3645 else{
3646 $mbias_2{CpG}->{$index+1}->{meth}++;
3647 }
3648 }
3649 elsif ($methylation_calls[$index] eq 'z') {
3650 $counting{total_unmethylated_CpG_count}++;
3651 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3652 if ($read_identity == 1){
3653 $mbias_1{CpG}->{$index+1}->{un}++;
3654 }
3655 else{
3656 $mbias_2{CpG}->{$index+1}->{un}++;
3657 }
3658 }
3659 elsif ($methylation_calls[$index] eq 'H') {
3660 $counting{total_meCHH_count}++;
3661 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3662 if ($read_identity == 1){
3663 $mbias_1{CHH}->{$index+1}->{meth}++;
3664 }
3665 else{
3666 $mbias_2{CHH}->{$index+1}->{meth}++;
3667 }
3668 }
3669 elsif ($methylation_calls[$index] eq 'h') {
3670 $counting{total_unmethylated_CHH_count}++;
3671 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3672 if ($read_identity == 1){
3673 $mbias_1{CHH}->{$index+1}->{un}++;
3674 }
3675 else{
3676 $mbias_2{CHH}->{$index+1}->{un}++;
3677 }
3678 }
3679 elsif ($methylation_calls[$index] eq '.') {}
3680 elsif (lc$methylation_calls[$index] eq 'u'){}
3681 else{
3682 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3683 }
3684 }
3685 } elsif ($strand eq '-') {
3686 for my $index (0..$#methylation_calls) {
3687
3688 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3689 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3690 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3691 $cigar_offset += $cigar_mod;
3692 $pos_offset += $pos_mod;
3693 }
3694
3695 ### Returning as soon as the methylation calls start overlapping
3696 if ($start-$index+$pos_offset <= $end_read_1) {
3697 return;
3698 }
3699
3700 if ($methylation_calls[$index] eq 'X') {
3701 $counting{total_meCHG_count}++;
3702 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3703 if ($read_identity == 1){
3704 $mbias_1{CHG}->{$index+1}->{meth}++;
3705 }
3706 else{
3707 $mbias_2{CHG}->{$index+1}->{meth}++;
3708 }
3709 }
3710 elsif ($methylation_calls[$index] eq 'x') {
3711 $counting{total_unmethylated_CHG_count}++;
3712 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3713 if ($read_identity == 1){
3714 $mbias_1{CHG}->{$index+1}->{un}++;
3715 }
3716 else{
3717 $mbias_2{CHG}->{$index+1}->{un}++;
3718 }
3719 }
3720 elsif ($methylation_calls[$index] eq 'Z') {
3721 $counting{total_meCpG_count}++;
3722 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3723 if ($read_identity == 1){
3724 $mbias_1{CpG}->{$index+1}->{meth}++;
3725 }
3726 else{
3727 $mbias_2{CpG}->{$index+1}->{meth}++;
3728 }
3729 }
3730 elsif ($methylation_calls[$index] eq 'z') {
3731 $counting{total_unmethylated_CpG_count}++;
3732 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3733 if ($read_identity == 1){
3734 $mbias_1{CpG}->{$index+1}->{un}++;
3735 }
3736 else{
3737 $mbias_2{CpG}->{$index+1}->{un}++;
3738 }
3739 }
3740 elsif ($methylation_calls[$index] eq 'H') {
3741 $counting{total_meCHH_count}++;
3742 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3743 if ($read_identity == 1){
3744 $mbias_1{CHH}->{$index+1}->{meth}++;
3745 }
3746 else{
3747 $mbias_2{CHH}->{$index+1}->{meth}++;
3748 }
3749 }
3750 elsif ($methylation_calls[$index] eq 'h') {
3751 $counting{total_unmethylated_CHH_count}++;
3752 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3753 if ($read_identity == 1){
3754 $mbias_1{CHH}->{$index+1}->{un}++;
3755 }
3756 else{
3757 $mbias_2{CHH}->{$index+1}->{un}++;
3758 }
3759 }
3760 elsif ($methylation_calls[$index] eq '.') {}
3761 elsif (lc$methylation_calls[$index] eq 'u'){}
3762 else{
3763 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3764 }
3765 }
3766 } else {
3767 die "The strand orientation as neither + nor -: '$strand'\n";
3768 }
3769 }
3770
3771 ### strand-specific methylation output
3772 else {
3773 if ($strand eq '+') {
3774 for my $index (0..$#methylation_calls) {
3775
3776 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3777 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3778 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3779 $cigar_offset += $cigar_mod;
3780 $pos_offset += $pos_mod;
3781 }
3782
3783 ### Returning as soon as the methylation calls start overlapping
3784 if ($start+$index+$pos_offset >= $end_read_1) {
3785 return;
3786 }
3787
3788 if ($methylation_calls[$index] eq 'X') {
3789 $counting{total_meCHG_count}++;
3790 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3791 if ($read_identity == 1){
3792 $mbias_1{CHG}->{$index+1}->{meth}++;
3793 }
3794 else{
3795 $mbias_2{CHG}->{$index+1}->{meth}++;
3796 }
3797 }
3798 elsif ($methylation_calls[$index] eq 'x') {
3799 $counting{total_unmethylated_CHG_count}++;
3800 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3801 if ($read_identity == 1){
3802 $mbias_1{CHG}->{$index+1}->{un}++;
3803 }
3804 else{
3805 $mbias_2{CHG}->{$index+1}->{un}++;
3806 }
3807 }
3808 elsif ($methylation_calls[$index] eq 'Z') {
3809 $counting{total_meCpG_count}++;
3810 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3811 if ($read_identity == 1){
3812 $mbias_1{CpG}->{$index+1}->{meth}++;
3813 }
3814 else{
3815 $mbias_2{CpG}->{$index+1}->{meth}++;
3816 }
3817 }
3818 elsif ($methylation_calls[$index] eq 'z') {
3819 $counting{total_unmethylated_CpG_count}++;
3820 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3821 if ($read_identity == 1){
3822 $mbias_1{CpG}->{$index+1}->{un}++;
3823 }
3824 else{
3825 $mbias_2{CpG}->{$index+1}->{un}++;
3826 }
3827 }
3828 elsif ($methylation_calls[$index] eq 'H') {
3829 $counting{total_meCHH_count}++;
3830 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3831 if ($read_identity == 1){
3832 $mbias_1{CHH}->{$index+1}->{meth}++;
3833 }
3834 else{
3835 $mbias_2{CHH}->{$index+1}->{meth}++;
3836 }
3837 }
3838 elsif ($methylation_calls[$index] eq 'h') {
3839 $counting{total_unmethylated_CHH_count}++;
3840 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3841 if ($read_identity == 1){
3842 $mbias_1{CHH}->{$index+1}->{un}++;
3843 }
3844 else{
3845 $mbias_2{CHH}->{$index+1}->{un}++;
3846 }
3847 }
3848 elsif ($methylation_calls[$index] eq '.') {}
3849 elsif (lc$methylation_calls[$index] eq 'u'){}
3850 else{
3851 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3852 }
3853 }
3854 } elsif ($strand eq '-') {
3855 for my $index (0..$#methylation_calls) {
3856
3857 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3858 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3859 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3860 $cigar_offset += $cigar_mod;
3861 $pos_offset += $pos_mod;
3862 }
3863
3864 ### Returning as soon as the methylation calls start overlapping
3865 if ($start-$index+$pos_offset <= $end_read_1) {
3866 return;
3867 }
3868
3869 if ($methylation_calls[$index] eq 'X') {
3870 $counting{total_meCHG_count}++;
3871 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3872 if ($read_identity == 1){
3873 $mbias_1{CHG}->{$index+1}->{meth}++;
3874 }
3875 else{
3876 $mbias_2{CHG}->{$index+1}->{meth}++;
3877 }
3878 }
3879 elsif ($methylation_calls[$index] eq 'x') {
3880 $counting{total_unmethylated_CHG_count}++;
3881 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3882 if ($read_identity == 1){
3883 $mbias_1{CHG}->{$index+1}->{un}++;
3884 }
3885 else{
3886 $mbias_2{CHG}->{$index+1}->{un}++;
3887 }
3888 }
3889 elsif ($methylation_calls[$index] eq 'Z') {
3890 $counting{total_meCpG_count}++;
3891 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3892 if ($read_identity == 1){
3893 $mbias_1{CpG}->{$index+1}->{meth}++;
3894 }
3895 else{
3896 $mbias_2{CpG}->{$index+1}->{meth}++;
3897 }
3898 }
3899 elsif ($methylation_calls[$index] eq 'z') {
3900 $counting{total_unmethylated_CpG_count}++;
3901 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3902 if ($read_identity == 1){
3903 $mbias_1{CpG}->{$index+1}->{un}++;
3904 }
3905 else{
3906 $mbias_2{CpG}->{$index+1}->{un}++;
3907 }
3908 }
3909 elsif ($methylation_calls[$index] eq 'H') {
3910 $counting{total_meCHH_count}++;
3911 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3912 if ($read_identity == 1){
3913 $mbias_1{CHH}->{$index+1}->{meth}++;
3914 }
3915 else{
3916 $mbias_2{CHH}->{$index+1}->{meth}++;
3917 }
3918 }
3919 elsif ($methylation_calls[$index] eq 'h') {
3920 $counting{total_unmethylated_CHH_count}++;
3921 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3922 if ($read_identity == 1){
3923 $mbias_1{CHH}->{$index+1}->{un}++;
3924 }
3925 else{
3926 $mbias_2{CHH}->{$index+1}->{un}++;
3927 }
3928 }
3929 elsif ($methylation_calls[$index] eq '.') {}
3930 elsif (lc$methylation_calls[$index] eq 'u'){}
3931 else{
3932 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3933 }
3934 }
3935 } else {
3936 die "The strand orientation as neither + nor -: '$strand'\n";
3937 }
3938 }
3939 }
3940
3941 ### this is the paired-end procedure allowing overlaps and using every single C position
3942 else {
3943 ### single-file CpG, CHG and CHH context output
3944 if ($full) {
3945 if ($strand eq '+') {
3946 for my $index (0..$#methylation_calls) {
3947
3948 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3949 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3950 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3951 $cigar_offset += $cigar_mod;
3952 $pos_offset += $pos_mod;
3953 }
3954
3955 if ($methylation_calls[$index] eq 'X') {
3956 $counting{total_meCHG_count}++;
3957 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3958 if ($read_identity == 1){
3959 $mbias_1{CHG}->{$index+1}->{meth}++;
3960 }
3961 else{
3962 $mbias_2{CHG}->{$index+1}->{meth}++;
3963 }
3964 }
3965 elsif ($methylation_calls[$index] eq 'x') {
3966 $counting{total_unmethylated_CHG_count}++;
3967 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3968 if ($read_identity == 1){
3969 $mbias_1{CHG}->{$index+1}->{un}++;
3970 }
3971 else{
3972 $mbias_2{CHG}->{$index+1}->{un}++;
3973 }
3974 }
3975 elsif ($methylation_calls[$index] eq 'Z') {
3976 $counting{total_meCpG_count}++;
3977 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3978 if ($read_identity == 1){
3979 $mbias_1{CpG}->{$index+1}->{meth}++;
3980 }
3981 else{
3982 $mbias_2{CpG}->{$index+1}->{meth}++;
3983 }
3984 }
3985 elsif ($methylation_calls[$index] eq 'z') {
3986 $counting{total_unmethylated_CpG_count}++;
3987 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3988 if ($read_identity == 1){
3989 $mbias_1{CpG}->{$index+1}->{un}++;
3990 }
3991 else{
3992 $mbias_2{CpG}->{$index+1}->{un}++;
3993 }
3994 }
3995 elsif ($methylation_calls[$index] eq 'H') {
3996 $counting{total_meCHH_count}++;
3997 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
3998 if ($read_identity == 1){
3999 $mbias_1{CHH}->{$index+1}->{meth}++;
4000 }
4001 else{
4002 $mbias_2{CHH}->{$index+1}->{meth}++;
4003 }
4004 }
4005 elsif ($methylation_calls[$index] eq 'h') {
4006 $counting{total_unmethylated_CHH_count}++;
4007 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4008 if ($read_identity == 1){
4009 $mbias_1{CHH}->{$index+1}->{un}++;
4010 }
4011 else{
4012 $mbias_2{CHH}->{$index+1}->{un}++;
4013 }
4014 }
4015 elsif ($methylation_calls[$index] eq '.') {}
4016 elsif (lc$methylation_calls[$index] eq 'u'){}
4017 else{
4018 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4019 }
4020 }
4021 } elsif ($strand eq '-') {
4022 for my $index (0..$#methylation_calls) {
4023
4024 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4025 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4026 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4027 $cigar_offset += $cigar_mod;
4028 $pos_offset += $pos_mod;
4029 }
4030
4031 if ($methylation_calls[$index] eq 'X') {
4032 $counting{total_meCHG_count}++;
4033 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4034 if ($read_identity == 1){
4035 $mbias_1{CHG}->{$index+1}->{meth}++;
4036 }
4037 else{
4038 $mbias_2{CHG}->{$index+1}->{meth}++;
4039 }
4040 }
4041 elsif ($methylation_calls[$index] eq 'x') {
4042 $counting{total_unmethylated_CHG_count}++;
4043 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4044 if ($read_identity == 1){
4045 $mbias_1{CHG}->{$index+1}->{un}++;
4046 }
4047 else{
4048 $mbias_2{CHG}->{$index+1}->{un}++;
4049 }
4050 }
4051 elsif ($methylation_calls[$index] eq 'Z') {
4052 $counting{total_meCpG_count}++;
4053 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4054 if ($read_identity == 1){
4055 $mbias_1{CpG}->{$index+1}->{meth}++;
4056 }
4057 else{
4058 $mbias_2{CpG}->{$index+1}->{meth}++;
4059 }
4060 }
4061 elsif ($methylation_calls[$index] eq 'z') {
4062 $counting{total_unmethylated_CpG_count}++;
4063 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4064 if ($read_identity == 1){
4065 $mbias_1{CpG}->{$index+1}->{un}++;
4066 }
4067 else{
4068 $mbias_2{CpG}->{$index+1}->{un}++;
4069 }
4070 }
4071 elsif ($methylation_calls[$index] eq 'H') {
4072 $counting{total_meCHH_count}++;
4073 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4074 if ($read_identity == 1){
4075 $mbias_1{CHH}->{$index+1}->{meth}++;
4076 }
4077 else{
4078 $mbias_2{CHH}->{$index+1}->{meth}++;
4079 }
4080 }
4081 elsif ($methylation_calls[$index] eq 'h') {
4082 $counting{total_unmethylated_CHH_count}++;
4083 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4084 if ($read_identity == 1){
4085 $mbias_1{CHH}->{$index+1}->{un}++;
4086 }
4087 else{
4088 $mbias_2{CHH}->{$index+1}->{un}++;
4089 }
4090 }
4091 elsif ($methylation_calls[$index] eq '.') {}
4092 elsif (lc$methylation_calls[$index] eq 'u'){}
4093 else{
4094 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4095 }
4096 }
4097 } else {
4098 die "The strand orientation as neither + nor -: '$strand'\n";
4099 }
4100 }
4101
4102 ### strand-specific methylation output
4103 else {
4104 if ($strand eq '+') {
4105 for my $index (0..$#methylation_calls) {
4106
4107 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4108 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4109 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
4110 $cigar_offset += $cigar_mod;
4111 $pos_offset += $pos_mod;
4112 }
4113
4114 if ($methylation_calls[$index] eq 'X') {
4115 $counting{total_meCHG_count}++;
4116 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4117 if ($read_identity == 1){
4118 $mbias_1{CHG}->{$index+1}->{meth}++;
4119 }
4120 else{
4121 $mbias_2{CHG}->{$index+1}->{meth}++;
4122 }
4123 }
4124 elsif ($methylation_calls[$index] eq 'x') {
4125 $counting{total_unmethylated_CHG_count}++;
4126 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4127 if ($read_identity == 1){
4128 $mbias_1{CHG}->{$index+1}->{un}++;
4129 }
4130 else{
4131 $mbias_2{CHG}->{$index+1}->{un}++;
4132 }
4133 }
4134 elsif ($methylation_calls[$index] eq 'Z') {
4135 $counting{total_meCpG_count}++;
4136 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4137 if ($read_identity == 1){
4138 $mbias_1{CpG}->{$index+1}->{meth}++;
4139 }
4140 else{
4141 $mbias_2{CpG}->{$index+1}->{meth}++;
4142 }
4143 }
4144 elsif ($methylation_calls[$index] eq 'z') {
4145 $counting{total_unmethylated_CpG_count}++;
4146 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4147 if ($read_identity == 1){
4148 $mbias_1{CpG}->{$index+1}->{un}++;
4149 }
4150 else{
4151 $mbias_2{CpG}->{$index+1}->{un}++;
4152 }
4153 }
4154 elsif ($methylation_calls[$index] eq 'H') {
4155 $counting{total_meCHH_count}++;
4156 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4157 if ($read_identity == 1){
4158 $mbias_1{CHH}->{$index+1}->{meth}++;
4159 }
4160 else{
4161 $mbias_2{CHH}->{$index+1}->{meth}++;
4162 }
4163 }
4164 elsif ($methylation_calls[$index] eq 'h') {
4165 $counting{total_unmethylated_CHH_count}++;
4166 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4167 if ($read_identity == 1){
4168 $mbias_1{CHH}->{$index+1}->{un}++;
4169 }
4170 else{
4171 $mbias_2{CHH}->{$index+1}->{un}++;
4172 }
4173 }
4174 elsif ($methylation_calls[$index] eq '.') {}
4175 elsif (lc$methylation_calls[$index] eq 'u'){}
4176 else{
4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4178 }
4179 }
4180 } elsif ($strand eq '-') {
4181 for my $index (0..$#methylation_calls) {
4182
4183 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
4184 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4185 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4186 $cigar_offset += $cigar_mod;
4187 $pos_offset += $pos_mod;
4188 }
4189
4190 if ($methylation_calls[$index] eq 'X') {
4191 $counting{total_meCHG_count}++;
4192 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4193 if ($read_identity == 1){
4194 $mbias_1{CHG}->{$index+1}->{meth}++;
4195 }
4196 else{
4197 $mbias_2{CHG}->{$index+1}->{meth}++;
4198 }
4199 }
4200 elsif ($methylation_calls[$index] eq 'x') {
4201 $counting{total_unmethylated_CHG_count}++;
4202 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4203 if ($read_identity == 1){
4204 $mbias_1{CHG}->{$index+1}->{un}++;
4205 }
4206 else{
4207 $mbias_2{CHG}->{$index+1}->{un}++;
4208 }
4209 }
4210 elsif ($methylation_calls[$index] eq 'Z') {
4211 $counting{total_meCpG_count}++;
4212 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4213 if ($read_identity == 1){
4214 $mbias_1{CpG}->{$index+1}->{meth}++;
4215 }
4216 else{
4217 $mbias_2{CpG}->{$index+1}->{meth}++;
4218 }
4219 }
4220 elsif ($methylation_calls[$index] eq 'z') {
4221 $counting{total_unmethylated_CpG_count}++;
4222 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4223 if ($read_identity == 1){
4224 $mbias_1{CpG}->{$index+1}->{un}++;
4225 }
4226 else{
4227 $mbias_2{CpG}->{$index+1}->{un}++;
4228 }
4229 }
4230 elsif ($methylation_calls[$index] eq 'H') {
4231 $counting{total_meCHH_count}++;
4232 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4233 if ($read_identity == 1){
4234 $mbias_1{CHH}->{$index+1}->{meth}++;
4235 }
4236 else{
4237 $mbias_2{CHH}->{$index+1}->{meth}++;
4238 }
4239 }
4240 elsif ($methylation_calls[$index] eq 'h') {
4241 $counting{total_unmethylated_CHH_count}++;
4242 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4243 if ($read_identity == 1){
4244 $mbias_1{CHH}->{$index+1}->{un}++;
4245 }
4246 else{
4247 $mbias_2{CHH}->{$index+1}->{un}++;
4248 }
4249 }
4250 elsif ($methylation_calls[$index] eq '.') {}
4251 elsif (lc$methylation_calls[$index] eq 'u'){}
4252 else{
4253 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4254 }
4255 }
4256 } else {
4257 die "The strand orientation as neither + nor -: '$strand'\n";
4258 }
4259 }
4260 }
4261 }
4262
4263 sub check_cigar_string {
4264 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
4265 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
4266 my ($new_cigar_offset,$new_pos_offset) = (0,0);
4267
4268 if ($strand eq '+') {
4269 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
4270
4271 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4272 # warn "position needs no adjustment\n";
4273 }
4274
4275 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
4276 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
4277 # warn "adjusted genomic position by -1 bp (insertion)\n";
4278 }
4279
4280 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4281 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4282 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
4283 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
4284
4285 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
4286 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4287 # warn "position needs no adjustment\n";
4288 last;
4289 }
4290 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
4291 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
4292 # warn "adjusted genomic position by another -1 bp (insertion)\n";
4293 last;
4294 }
4295 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4296 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4297 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
4298 # warn "adjusted genomic position by another +1 bp (deletion)\n";
4299 }
4300 else{
4301 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4302 }
4303 }
4304 }
4305 else{
4306 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4307 }
4308 }
4309
4310 elsif ($strand eq '-') {
4311 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
4312
4313 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4314 # warn "position needs no adjustment\n";
4315 }
4316
4317 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
4318 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
4319 # warn "adjusted genomic position by +1 bp (insertion)\n";
4320 }
4321
4322 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4323 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4324 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
4325 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
4326
4327 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
4328 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
4329 # warn "Found new 'M' operation; position needs no adjustment\n";
4330 last;
4331 }
4332 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
4333 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
4334 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
4335 last;
4336 }
4337 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
4338 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
4339 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
4340 # warn "adjusted genomic position by another -1 bp (deletion)\n";
4341 }
4342 else{
4343 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4344 }
4345 }
4346 }
4347 else{
4348 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
4349 }
4350 }
4351 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
4352 return ($new_cigar_offset,$new_pos_offset);
4353 }
4354
4355 sub print_individual_C_methylation_states_single_end{
4356
4357 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
4358 my @methylation_calls = split(//,$meth_call);
4359
4360 #################################################################
4361 ### . for bases not involving cytosines ###
4362 ### X for methylated C in CHG context (was protected) ###
4363 ### x for not methylated C in CHG context (was converted) ###
4364 ### H for methylated C in CHH context (was protected) ###
4365 ### h for not methylated C in CHH context (was converted) ###
4366 ### Z for methylated C in CpG context (was protected) ###
4367 ### z for not methylated C in CpG context (was converted) ###
4368 #################################################################
4369
4370 my $methyl_CHG_count = 0;
4371 my $methyl_CHH_count = 0;
4372 my $methyl_CpG_count = 0;
4373 my $unmethylated_CHG_count = 0;
4374 my $unmethylated_CHH_count = 0;
4375 my $unmethylated_CpG_count = 0;
4376
4377 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
4378 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
4379
4380 my @comp_cigar;
4381
4382 if ($cigar){ # parsing CIGAR string
4383
4384 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing
4385 if ($cigar =~ /^\d+M$/){
4386 # warn "See!? I told you so! $cigar\n";
4387 # sleep(1);
4388 }
4389 else{
4390
4391 my @len;
4392 my @ops;
4393
4394 @len = split (/\D+/,$cigar); # storing the length per operation
4395 @ops = split (/\d+/,$cigar); # storing the operation
4396 shift @ops; # remove the empty first element
4397 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
4398 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
4399
4400 foreach my $index (0..$#len){
4401 foreach (1..$len[$index]){
4402 # print "$ops[$index]";
4403 push @comp_cigar, $ops[$index];
4404 }
4405 }
4406 }
4407 # warn "\nDetected CIGAR string: $cigar\n";
4408 # warn "Length of methylation call: ",length $meth_call,"\n";
4409 # warn "number of operations: ",scalar @ops,"\n";
4410 # warn "number of length digits: ",scalar @len,"\n\n";
4411 # print @comp_cigar,"\n";
4412 # print "$meth_call\n\n";
4413 # sleep (1);
4414 }
4415
4416 ### adjusting the start position for all reads mapping to the reverse strand
4417 if ($strand eq '-') {
4418
4419 if (@comp_cigar){ # only needed for SAM reads with InDels
4420 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
4421 # print @comp_cigar,"\n";
4422 }
4423
4424 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
4425
4426 if ($cigar){ ### SAM format
4427 if ($cigar =~ /^(\d+)M$/){ # linear match
4428 $start += $1 - 1;
4429 }
4430 else{ # InDel read
4431 my $MD_count = 0;
4432 foreach (@comp_cigar){
4433 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
4434 }
4435 $start += $MD_count - 1;
4436 }
4437 }
4438 else{ ### vanilla format
4439 $start += length($meth_call)-1;
4440 }
4441 }
4442 }
4443
4444 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
4445
4446 ### single-file CpG and other-context output
4447 if ($full and $merge_non_CpG) {
4448 if ($strand eq '+') {
4449 for my $index (0..$#methylation_calls) {
4450
4451 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
4452 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4453 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
4454 $cigar_offset += $cigar_mod;
4455 $pos_offset += $pos_mod;
4456 }
4457
4458 ### methylated Cs (any context) will receive a forward (+) orientation
4459 ### not methylated Cs (any context) will receive a reverse (-) orientation
4460 if ($methylation_calls[$index] eq 'X') {
4461 $counting{total_meCHG_count}++;
4462 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4463 print {$fhs{other_context}} $line unless($mbias_only);
4464 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4465 $mbias_1{CHG}->{$index+1}->{meth}++;
4466 }
4467 elsif ($methylation_calls[$index] eq 'x') {
4468 $counting{total_unmethylated_CHG_count}++;
4469 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4470 print {$fhs{other_context}} $line unless($mbias_only);
4471 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4472 $mbias_1{CHG}->{$index+1}->{un}++;
4473 }
4474 elsif ($methylation_calls[$index] eq 'Z') {
4475 $counting{total_meCpG_count}++;
4476 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4477 print {$fhs{CpG_context}} $line unless($mbias_only);
4478 # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4479 $mbias_1{CpG}->{$index+1}->{meth}++;
4480 }
4481 elsif ($methylation_calls[$index] eq 'z') {
4482 $counting{total_unmethylated_CpG_count}++;
4483 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4484 print {$fhs{CpG_context}} $line unless($mbias_only);
4485 # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4486 $mbias_1{CpG}->{$index+1}->{un}++;
4487 }
4488 elsif ($methylation_calls[$index] eq 'H') {
4489 $counting{total_meCHH_count}++;
4490 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4491 print {$fhs{other_context}} $line unless($mbias_only);
4492 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4493 $mbias_1{CHH}->{$index+1}->{meth}++;
4494 }
4495 elsif ($methylation_calls[$index] eq 'h') {
4496 $counting{total_unmethylated_CHH_count}++;
4497 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n";
4498 print {$fhs{other_context}} $line unless($mbias_only);
4499 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4500 $mbias_1{CHH}->{$index+1}->{un}++;
4501 }
4502 elsif ($methylation_calls[$index] eq '.') {}
4503 elsif (lc$methylation_calls[$index] eq 'u'){}
4504 else{
4505 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4506 }
4507 }
4508 }
4509 elsif ($strand eq '-') {
4510
4511 for my $index (0..$#methylation_calls) {
4512 ### methylated Cs (any context) will receive a forward (+) orientation
4513 ### not methylated Cs (any context) will receive a reverse (-) orientation
4514
4515 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
4516 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4517 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4518 $cigar_offset += $cigar_mod;
4519 $pos_offset += $pos_mod;
4520 }
4521
4522 if ($methylation_calls[$index] eq 'X') {
4523 $counting{total_meCHG_count}++;
4524 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4525 print {$fhs{other_context}} $line unless($mbias_only);
4526 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4527 $mbias_1{CHG}->{$index+1}->{meth}++;
4528 }
4529 elsif ($methylation_calls[$index] eq 'x') {
4530 $counting{total_unmethylated_CHG_count}++;
4531 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4532 print {$fhs{other_context}} $line unless($mbias_only);
4533 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4534 $mbias_1{CHG}->{$index+1}->{un}++;
4535 }
4536 elsif ($methylation_calls[$index] eq 'Z') {
4537 $counting{total_meCpG_count}++;
4538 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4539 print {$fhs{CpG_context}} $line unless($mbias_only);
4540 #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4541 $mbias_1{CpG}->{$index+1}->{meth}++;
4542 }
4543 elsif ($methylation_calls[$index] eq 'z') {
4544 $counting{total_unmethylated_CpG_count}++;
4545 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4546 print {$fhs{CpG_context}} $line unless($mbias_only);
4547 #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4548 $mbias_1{CpG}->{$index+1}->{un}++;
4549 }
4550 elsif ($methylation_calls[$index] eq 'H') {
4551 $counting{total_meCHH_count}++;
4552 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4553 print {$fhs{other_context}} $line unless($mbias_only);
4554 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4555 $mbias_1{CHH}->{$index+1}->{meth}++;
4556 }
4557 elsif ($methylation_calls[$index] eq 'h') {
4558 $counting{total_unmethylated_CHH_count}++;
4559 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n";
4560 print {$fhs{other_context}} $line unless($mbias_only);
4561 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4562 $mbias_1{CHH}->{$index+1}->{un}++;
4563 }
4564 elsif ($methylation_calls[$index] eq '.'){}
4565 elsif (lc$methylation_calls[$index] eq 'u'){}
4566 else{
4567 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4568 }
4569 }
4570 }
4571 else {
4572 die "The strand information was neither + nor -: $strand\n";
4573 }
4574 }
4575
4576 ### strand-specific methylation output
4577 elsif ($merge_non_CpG) {
4578 if ($strand eq '+') {
4579 for my $index (0..$#methylation_calls) {
4580 ### methylated Cs (any context) will receive a forward (+) orientation
4581 ### not methylated Cs (any context) will receive a reverse (-) orientation
4582
4583 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4584 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4585 $cigar_offset += $cigar_mod;
4586 $pos_offset += $pos_mod;
4587 }
4588
4589 if ($methylation_calls[$index] eq 'X') {
4590 $counting{total_meCHG_count}++;
4591 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4592 $mbias_1{CHG}->{$index+1}->{meth}++;
4593 }
4594 elsif ($methylation_calls[$index] eq 'x') {
4595 $counting{total_unmethylated_CHG_count}++;
4596 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4597 $mbias_1{CHG}->{$index+1}->{un}++;
4598 }
4599 elsif ($methylation_calls[$index] eq 'Z') {
4600 $counting{total_meCpG_count}++;
4601 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4602 $mbias_1{CpG}->{$index+1}->{meth}++;
4603 }
4604 elsif ($methylation_calls[$index] eq 'z') {
4605 $counting{total_unmethylated_CpG_count}++;
4606 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4607 $mbias_1{CpG}->{$index+1}->{un}++;
4608 }
4609 elsif ($methylation_calls[$index] eq 'H') {
4610 $counting{total_meCHH_count}++;
4611 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4612 $mbias_1{CHH}->{$index+1}->{meth}++;
4613 }
4614 elsif ($methylation_calls[$index] eq 'h') {
4615 $counting{total_unmethylated_CHH_count}++;
4616 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4617 $mbias_1{CHH}->{$index+1}->{un}++;
4618 }
4619 elsif ($methylation_calls[$index] eq '.') {}
4620 elsif (lc$methylation_calls[$index] eq 'u'){}
4621 else{
4622 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4623 }
4624 }
4625 }
4626 elsif ($strand eq '-') {
4627
4628 for my $index (0..$#methylation_calls) {
4629 ### methylated Cs (any context) will receive a forward (+) orientation
4630 ### not methylated Cs (any context) will receive a reverse (-) orientation
4631
4632 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4633 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4634 $cigar_offset += $cigar_mod;
4635 $pos_offset += $pos_mod;
4636 }
4637
4638 if ($methylation_calls[$index] eq 'X') {
4639 $counting{total_meCHG_count}++;
4640 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4641 $mbias_1{CHG}->{$index+1}->{meth}++;
4642 }
4643 elsif ($methylation_calls[$index] eq 'x') {
4644 $counting{total_unmethylated_CHG_count}++;
4645 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4646 $mbias_1{CHG}->{$index+1}->{un}++;
4647 }
4648 elsif ($methylation_calls[$index] eq 'Z') {
4649 $counting{total_meCpG_count}++;
4650 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4651 $mbias_1{CpG}->{$index+1}->{meth}++;
4652 }
4653 elsif ($methylation_calls[$index] eq 'z') {
4654 $counting{total_unmethylated_CpG_count}++;
4655 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4656 $mbias_1{CpG}->{$index+1}->{un}++;
4657 }
4658 elsif ($methylation_calls[$index] eq 'H') {
4659 $counting{total_meCHH_count}++;
4660 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4661 $mbias_1{CHH}->{$index+1}->{meth}++;
4662 }
4663 elsif ($methylation_calls[$index] eq 'h') {
4664 $counting{total_unmethylated_CHH_count}++;
4665 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4666 $mbias_1{CHH}->{$index+1}->{un}++;
4667 }
4668 elsif ($methylation_calls[$index] eq '.') {}
4669 elsif (lc$methylation_calls[$index] eq 'u'){}
4670 else{
4671 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4672 }
4673 }
4674 }
4675 else {
4676 die "The strand information was neither + nor -: $strand\n";
4677 }
4678 }
4679
4680 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
4681
4682 elsif ($full) {
4683 if ($strand eq '+') {
4684 for my $index (0..$#methylation_calls) {
4685 ### methylated Cs (any context) will receive a forward (+) orientation
4686 ### not methylated Cs (any context) will receive a reverse (-) orientation
4687
4688 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4689 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4690 $cigar_offset += $cigar_mod;
4691 $pos_offset += $pos_mod;
4692 }
4693
4694 if ($methylation_calls[$index] eq 'X') {
4695 $counting{total_meCHG_count}++;
4696 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4697 $mbias_1{CHG}->{$index+1}->{meth}++;
4698 }
4699 elsif ($methylation_calls[$index] eq 'x') {
4700 $counting{total_unmethylated_CHG_count}++;
4701 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4702 $mbias_1{CHG}->{$index+1}->{un}++;
4703 }
4704 elsif ($methylation_calls[$index] eq 'Z') {
4705 $counting{total_meCpG_count}++;
4706 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4707 $mbias_1{CpG}->{$index+1}->{meth}++;
4708 }
4709 elsif ($methylation_calls[$index] eq 'z') {
4710 $counting{total_unmethylated_CpG_count}++;
4711 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4712 $mbias_1{CpG}->{$index+1}->{un}++;
4713 }
4714 elsif ($methylation_calls[$index] eq 'H') {
4715 $counting{total_meCHH_count}++;
4716 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4717 $mbias_1{CHH}->{$index+1}->{meth}++;
4718 }
4719 elsif ($methylation_calls[$index] eq 'h') {
4720 $counting{total_unmethylated_CHH_count}++;
4721 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4722 $mbias_1{CHH}->{$index+1}->{un}++;
4723 }
4724 elsif ($methylation_calls[$index] eq '.') {}
4725 elsif (lc$methylation_calls[$index] eq 'u'){}
4726 else{
4727 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
4728 }
4729 }
4730 }
4731 elsif ($strand eq '-') {
4732
4733 for my $index (0..$#methylation_calls) {
4734 ### methylated Cs (any context) will receive a forward (+) orientation
4735 ### not methylated Cs (any context) will receive a reverse (-) orientation
4736
4737 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4738 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4739 $cigar_offset += $cigar_mod;
4740 $pos_offset += $pos_mod;
4741 }
4742
4743 if ($methylation_calls[$index] eq 'X') {
4744 $counting{total_meCHG_count}++;
4745 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4746 $mbias_1{CHG}->{$index+1}->{meth}++;
4747 }
4748 elsif ($methylation_calls[$index] eq 'x') {
4749 $counting{total_unmethylated_CHG_count}++;
4750 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4751 $mbias_1{CHG}->{$index+1}->{un}++;
4752 }
4753 elsif ($methylation_calls[$index] eq 'Z') {
4754 $counting{total_meCpG_count}++;
4755 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4756 $mbias_1{CpG}->{$index+1}->{meth}++;
4757 }
4758 elsif ($methylation_calls[$index] eq 'z') {
4759 $counting{total_unmethylated_CpG_count}++;
4760 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4761 $mbias_1{CpG}->{$index+1}->{un}++;
4762 }
4763 elsif ($methylation_calls[$index] eq 'H') {
4764 $counting{total_meCHH_count}++;
4765 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4766 $mbias_1{CHH}->{$index+1}->{meth}++;
4767 }
4768 elsif ($methylation_calls[$index] eq 'h') {
4769 $counting{total_unmethylated_CHH_count}++;
4770 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4771 $mbias_1{CHH}->{$index+1}->{un}++;
4772 }
4773 elsif ($methylation_calls[$index] eq '.') {}
4774 elsif (lc$methylation_calls[$index] eq 'u'){}
4775 else{
4776 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4777 }
4778 }
4779 }
4780 else {
4781 die "The read had a strand orientation which was neither + nor -: $strand\n";
4782 }
4783 }
4784
4785 ### strand-specific methylation output
4786 else {
4787 if ($strand eq '+') {
4788 for my $index (0..$#methylation_calls) {
4789 ### methylated Cs (any context) will receive a forward (+) orientation
4790 ### not methylated Cs (any context) will receive a reverse (-) orientation
4791
4792 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4793 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4794 $cigar_offset += $cigar_mod;
4795 $pos_offset += $pos_mod;
4796 }
4797
4798 if ($methylation_calls[$index] eq 'X') {
4799 $counting{total_meCHG_count}++;
4800 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4801 $mbias_1{CHG}->{$index+1}->{meth}++;
4802 }
4803 elsif ($methylation_calls[$index] eq 'x') {
4804 $counting{total_unmethylated_CHG_count}++;
4805 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4806 $mbias_1{CHG}->{$index+1}->{un}++;
4807 }
4808 elsif ($methylation_calls[$index] eq 'Z') {
4809 $counting{total_meCpG_count}++;
4810 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4811 $mbias_1{CpG}->{$index+1}->{meth}++;
4812 }
4813 elsif ($methylation_calls[$index] eq 'z') {
4814 $counting{total_unmethylated_CpG_count}++;
4815 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4816 $mbias_1{CpG}->{$index+1}->{un}++;
4817 }
4818 elsif ($methylation_calls[$index] eq 'H') {
4819 $counting{total_meCHH_count}++;
4820 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4821 $mbias_1{CHH}->{$index+1}->{meth}++;
4822 }
4823 elsif ($methylation_calls[$index] eq 'h') {
4824 $counting{total_unmethylated_CHH_count}++;
4825 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4826 $mbias_1{CHH}->{$index+1}->{un}++;
4827 }
4828 elsif ($methylation_calls[$index] eq '.') {}
4829 elsif (lc$methylation_calls[$index] eq 'u'){}
4830 else{
4831 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4832 }
4833 }
4834 }
4835 elsif ($strand eq '-') {
4836
4837 for my $index (0..$#methylation_calls) {
4838 ### methylated Cs (any context) will receive a forward (+) orientation
4839 ### not methylated Cs (any context) will receive a reverse (-) orientation
4840
4841 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4842 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4843 $cigar_offset += $cigar_mod;
4844 $pos_offset += $pos_mod;
4845 }
4846
4847 if ($methylation_calls[$index] eq 'X') {
4848 $counting{total_meCHG_count}++;
4849 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4850 $mbias_1{CHG}->{$index+1}->{meth}++;
4851 }
4852 elsif ($methylation_calls[$index] eq 'x') {
4853 $counting{total_unmethylated_CHG_count}++;
4854 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4855 $mbias_1{CHG}->{$index+1}->{un}++;
4856 }
4857 elsif ($methylation_calls[$index] eq 'Z') {
4858 $counting{total_meCpG_count}++;
4859 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4860 $mbias_1{CpG}->{$index+1}->{meth}++;
4861 }
4862 elsif ($methylation_calls[$index] eq 'z') {
4863 $counting{total_unmethylated_CpG_count}++;
4864 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4865 $mbias_1{CpG}->{$index+1}->{un}++;
4866 }
4867 elsif ($methylation_calls[$index] eq 'H') {
4868 $counting{total_meCHH_count}++;
4869 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4870 $mbias_1{CHH}->{$index+1}->{meth}++;
4871 }
4872 elsif ($methylation_calls[$index] eq 'h') {
4873 $counting{total_unmethylated_CHH_count}++;
4874 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only);
4875 $mbias_1{CHH}->{$index+1}->{un}++;
4876 }
4877 elsif ($methylation_calls[$index] eq '.') {}
4878 elsif (lc$methylation_calls[$index] eq 'u'){}
4879 else{
4880 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4881 }
4882 }
4883 }
4884 else {
4885 die "The strand information was neither + nor -: $strand\n";
4886 }
4887 }
4888 }
4889
4890 sub open_output_filehandles{
4891
4892 my $filename = shift;
4893
4894 my $output_filename = (split (/\//,$filename))[-1];
4895 my $report_filename = $output_filename;
4896
4897 ### OPENING OUTPUT-FILEHANDLES
4898 if ($report) {
4899 $report_filename =~ s/\.sam$//;
4900 $report_filename =~ s/\.txt$//;
4901 $report_filename =~ s/$/_splitting_report.txt/;
4902 $report_filename = $output_dir . $report_filename;
4903 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
4904 }
4905
4906 if ($report) {
4907
4908 print REPORT "$output_filename\n\n";
4909 print REPORT "Parameters used to extract methylation information:\n";
4910 print REPORT "Bismark Extractor Version: $version\n";
4911
4912 if ($paired) {
4913 if ($vanilla) {
4914 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
4915 } else {
4916 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
4917 }
4918 }
4919
4920 if ($single) {
4921 if ($vanilla) {
4922 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
4923 } else {
4924 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
4925 }
4926 }
4927 if ($single){
4928 if ($ignore) {
4929 print REPORT "Ignoring first $ignore bp\n";
4930 }
4931 if ($ignore_3prime) {
4932 print REPORT "Ignoring last $ignore_3prime bp\n";
4933 }
4934 }
4935 else{ # paired-end
4936 if ($ignore) {
4937 print REPORT "Ignoring first $ignore bp of Read 1\n";
4938 }
4939 if ($ignore_r2){
4940 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
4941 }
4942
4943 if ($ignore_3prime) {
4944 print REPORT "Ignoring last $ignore_3prime bp of Read 1\n";
4945 }
4946 if ($ignore_3prime_r2){
4947 print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n";
4948 }
4949
4950 }
4951
4952 if ($full) {
4953 print REPORT "Output specified: comprehensive\n";
4954 } else {
4955 print REPORT "Output specified: strand-specific (default)\n";
4956 }
4957
4958 if ($no_overlap) {
4959 print REPORT "No overlapping methylation calls specified\n";
4960 }
4961 if ($genomic_fasta) {
4962 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
4963 }
4964 if ($merge_non_CpG) {
4965 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
4966 }
4967
4968 print REPORT "\n";
4969 }
4970
4971 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
4972
4973 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
4974 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
4975 if ($full and $merge_non_CpG) {
4976 my $cpg_output = my $other_c_output = $output_filename;
4977 ### C in CpG context
4978 $cpg_output =~ s/^/CpG_context_/;
4979 $cpg_output =~ s/sam$/txt/;
4980 $cpg_output =~ s/bam$/txt/;
4981 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
4982 $cpg_output = $output_dir . $cpg_output;
4983
4984 if ($gzip){
4985 $cpg_output .= '.gz';
4986 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4987 }
4988 else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option...
4989 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4990 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
4991 }
4992
4993 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
4994 push @sorting_files,$cpg_output;
4995
4996 unless ($no_header) {
4997 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
4998 }
4999
5000 ### C in any other context than CpG
5001 $other_c_output =~ s/^/Non_CpG_context_/;
5002 $other_c_output =~ s/sam$/txt/;
5003 $other_c_output =~ s/bam$/txt/;
5004 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
5005 $other_c_output = $output_dir . $other_c_output;
5006
5007 if ($gzip){
5008 $other_c_output .= '.gz';
5009 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
5010 }
5011 else{
5012 open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
5013 # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
5014 }
5015
5016 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
5017 push @sorting_files,$other_c_output;
5018
5019
5020 unless ($no_header) {
5021 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5022 }
5023 }
5024
5025 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
5026 elsif ($merge_non_CpG) {
5027
5028 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
5029
5030 ### For cytosines in CpG context
5031 $cpg_ot =~ s/^/CpG_OT_/;
5032 $cpg_ot =~ s/sam$/txt/;
5033 $cpg_ot =~ s/bam$/txt/;
5034 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
5035 $cpg_ot = $output_dir . $cpg_ot;
5036
5037 if ($gzip){
5038 $cpg_ot .= '.gz';
5039 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5040 }
5041 else{
5042 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5043 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5044 }
5045
5046 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
5047 push @sorting_files,$cpg_ot;
5048
5049 unless($no_header){
5050 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5051 }
5052
5053 $cpg_ctot =~ s/^/CpG_CTOT_/;
5054 $cpg_ctot =~ s/sam$/txt/;
5055 $cpg_ctot =~ s/bam$/txt/;
5056 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
5057 $cpg_ctot = $output_dir . $cpg_ctot;
5058
5059 if ($gzip){
5060 $cpg_ctot .= '.gz';
5061 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5062 }
5063 else{
5064 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5065 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5066 }
5067
5068 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
5069 push @sorting_files,$cpg_ctot;
5070
5071 unless($no_header){
5072 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5073 }
5074
5075 $cpg_ctob =~ s/^/CpG_CTOB_/;
5076 $cpg_ctob =~ s/sam$/txt/;
5077 $cpg_ctob =~ s/bam$/txt/;
5078 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
5079 $cpg_ctob = $output_dir . $cpg_ctob;
5080
5081 if ($gzip){
5082 $cpg_ctob .= '.gz';
5083 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5084 }
5085 else{
5086 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5087 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5088 }
5089
5090 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
5091 push @sorting_files,$cpg_ctob;
5092
5093 unless($no_header){
5094 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5095 }
5096
5097 $cpg_ob =~ s/^/CpG_OB_/;
5098 $cpg_ob =~ s/sam$/txt/;
5099 $cpg_ob =~ s/bam$/txt/;
5100 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
5101 $cpg_ob = $output_dir . $cpg_ob;
5102
5103 if ($gzip){
5104 $cpg_ob .= '.gz';
5105 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5106 }
5107 else{
5108 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5109 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5110 }
5111
5112 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
5113 push @sorting_files,$cpg_ob;
5114
5115 unless($no_header){
5116 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5117 }
5118
5119 ### For cytosines in Non-CpG (CC, CT or CA) context
5120 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
5121
5122 $other_c_ot =~ s/^/Non_CpG_OT_/;
5123 $other_c_ot =~ s/sam$/txt/;
5124 $other_c_ot =~ s/bam$/txt/;
5125 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
5126 $other_c_ot = $output_dir . $other_c_ot;
5127
5128 if ($gzip){
5129 $other_c_ot .= '.gz';
5130 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5131 }
5132 else{
5133 open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5134 # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
5135 }
5136
5137 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
5138 push @sorting_files,$other_c_ot;
5139
5140 unless($no_header){
5141 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5142 }
5143
5144 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
5145 $other_c_ctot =~ s/sam$/txt/;
5146 $other_c_ctot =~ s/bam$/txt/;
5147 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
5148 $other_c_ctot = $output_dir . $other_c_ctot;
5149
5150 if ($gzip){
5151 $other_c_ctot .= '.gz';
5152 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5153 }
5154 else{
5155 open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5156 # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
5157 }
5158
5159 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
5160 push @sorting_files,$other_c_ctot;
5161
5162 unless($no_header){
5163 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5164 }
5165
5166 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
5167 $other_c_ctob =~ s/sam$/txt/;
5168 $other_c_ctob =~ s/bam$/txt/;
5169 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
5170 $other_c_ctob = $output_dir . $other_c_ctob;
5171
5172 if ($gzip){
5173 $other_c_ctob .= '.gz';
5174 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5175 }
5176 else{
5177 open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5178 # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
5179 }
5180
5181 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
5182 push @sorting_files,$other_c_ctob;
5183
5184 unless($no_header){
5185 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5186 }
5187
5188 $other_c_ob =~ s/^/Non_CpG_OB_/;
5189 $other_c_ob =~ s/sam$/txt/;
5190 $other_c_ob =~ s/sam$/txt/;
5191 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
5192 $other_c_ob = $output_dir . $other_c_ob;
5193
5194 if ($gzip){
5195 $other_c_ob .= '.gz';
5196 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5197 }
5198 else{
5199 open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5200 # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
5201 }
5202
5203 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
5204 push @sorting_files,$other_c_ob;
5205
5206 unless($no_header){
5207 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5208 }
5209 }
5210 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
5211
5212 ### if --comprehensive was specified we are only writing one file per context
5213 elsif ($full) {
5214 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
5215 ### C in CpG context
5216 $cpg_output =~ s/^/CpG_context_/;
5217 $cpg_output =~ s/sam$/txt/;
5218 $cpg_output =~ s/bam$/txt/;
5219 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
5220 $cpg_output = $output_dir . $cpg_output;
5221
5222 if ($gzip){
5223 $cpg_output .= '.gz';
5224 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5225 }
5226 else{
5227 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5228 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
5229 }
5230
5231 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
5232 push @sorting_files,$cpg_output;
5233
5234 unless($no_header){
5235 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5236 }
5237
5238 ### C in CHG context
5239 $chg_output =~ s/^/CHG_context_/;
5240 $chg_output =~ s/sam$/txt/;
5241 $chg_output =~ s/bam$/txt/;
5242 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
5243 $chg_output = $output_dir . $chg_output;
5244
5245 if ($gzip){
5246 $chg_output .= '.gz';
5247 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5248 }
5249 else{
5250 open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5251 # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
5252 }
5253
5254 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
5255 push @sorting_files,$chg_output;
5256
5257 unless($no_header){
5258 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5259 }
5260
5261 ### C in CHH context
5262 $chh_output =~ s/^/CHH_context_/;
5263 $chh_output =~ s/sam$/txt/;
5264 $chh_output =~ s/bam$/txt/;
5265 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
5266 $chh_output = $output_dir . $chh_output;
5267
5268 if ($gzip){
5269 $chh_output .= '.gz';
5270 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5271 }
5272 else{
5273 open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5274 # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
5275 }
5276
5277 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
5278 push @sorting_files, $chh_output;
5279
5280 unless($no_header){
5281 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5282 }
5283 }
5284 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
5285 else {
5286 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
5287
5288 ### For cytosines in CpG context
5289 $cpg_ot =~ s/^/CpG_OT_/;
5290 $cpg_ot =~ s/sam$/txt/;
5291 $cpg_ot =~ s/bam$/txt/;
5292 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
5293 $cpg_ot = $output_dir . $cpg_ot;
5294
5295 if ($gzip){
5296 $cpg_ot .= '.gz';
5297 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5298 }
5299 else{
5300 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5301 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
5302 }
5303
5304 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
5305 push @sorting_files,$cpg_ot;
5306
5307 unless($no_header){
5308 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5309 }
5310
5311 $cpg_ctot =~ s/^/CpG_CTOT_/;
5312 $cpg_ctot =~ s/sam$/txt/;
5313 $cpg_ctot =~ s/bam$/txt/;
5314 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
5315 $cpg_ctot = $output_dir . $cpg_ctot;
5316
5317 if ($gzip){
5318 $cpg_ctot .= '.gz';
5319 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5320 }
5321 else{
5322 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5323 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
5324 }
5325
5326 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
5327 push @sorting_files,$cpg_ctot;
5328
5329 unless($no_header){
5330 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5331 }
5332
5333 $cpg_ctob =~ s/^/CpG_CTOB_/;
5334 $cpg_ctob =~ s/sam$/txt/;
5335 $cpg_ctob =~ s/bam$/txt/;
5336 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
5337 $cpg_ctob = $output_dir . $cpg_ctob;
5338
5339 if ($gzip){
5340 $cpg_ctob .= '.gz';
5341 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5342 }
5343 else{
5344 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5345 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
5346 }
5347
5348 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
5349 push @sorting_files,$cpg_ctob;
5350
5351 unless($no_header){
5352 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5353 }
5354
5355 $cpg_ob =~ s/^/CpG_OB_/;
5356 $cpg_ob =~ s/sam$/txt/;
5357 $cpg_ob =~ s/bam$/txt/;
5358 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
5359 $cpg_ob = $output_dir . $cpg_ob;
5360
5361 if ($gzip){
5362 $cpg_ob .= '.gz';
5363 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5364 }
5365 else{
5366 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5367 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
5368 }
5369
5370 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
5371 push @sorting_files,$cpg_ob;
5372
5373 unless($no_header){
5374 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5375 }
5376
5377 ### For cytosines in CHG context
5378 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
5379
5380 $chg_ot =~ s/^/CHG_OT_/;
5381 $chg_ot =~ s/sam$/txt/;
5382 $chg_ot =~ s/bam$/txt/;
5383 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
5384 $chg_ot = $output_dir . $chg_ot;
5385
5386 if ($gzip){
5387 $chg_ot .= '.gz';
5388 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5389 }
5390 else{
5391 open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5392 # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
5393 }
5394
5395 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
5396 push @sorting_files,$chg_ot;
5397
5398 unless($no_header){
5399 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5400 }
5401
5402 $chg_ctot =~ s/^/CHG_CTOT_/;
5403 $chg_ctot =~ s/sam$/txt/;
5404 $chg_ctot =~ s/bam$/txt/;
5405 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
5406 $chg_ctot = $output_dir . $chg_ctot;
5407
5408 if ($gzip){
5409 $chg_ctot .= '.gz';
5410 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5411 }
5412 else{
5413 open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5414 # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
5415 }
5416
5417 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
5418 push @sorting_files,$chg_ctot;
5419
5420 unless($no_header){
5421 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5422 }
5423
5424 $chg_ctob =~ s/^/CHG_CTOB_/;
5425 $chg_ctob =~ s/sam$/txt/;
5426 $chg_ctob =~ s/bam$/txt/;
5427 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
5428 $chg_ctob = $output_dir . $chg_ctob;
5429
5430 if ($gzip){
5431 $chg_ctob .= '.gz';
5432 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5433 }
5434 else{
5435 open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5436 # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
5437 }
5438
5439 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
5440 push @sorting_files,$chg_ctob;
5441
5442 unless($no_header){
5443 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5444 }
5445
5446 $chg_ob =~ s/^/CHG_OB_/;
5447 $chg_ob =~ s/sam$/txt/;
5448 $chg_ob =~ s/bam$/txt/;
5449 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
5450 $chg_ob = $output_dir . $chg_ob;
5451
5452 if ($gzip){
5453 $chg_ob .= '.gz';
5454 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5455 }
5456 else{
5457 open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5458 # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
5459 }
5460
5461 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
5462 push @sorting_files,$chg_ob;
5463
5464 unless($no_header){
5465 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5466 }
5467
5468 ### For cytosines in CHH context
5469 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
5470
5471 $chh_ot =~ s/^/CHH_OT_/;
5472 $chh_ot =~ s/sam$/txt/;
5473 $chh_ot =~ s/bam$/txt/;
5474 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
5475 $chh_ot = $output_dir . $chh_ot;
5476
5477 if ($gzip){
5478 $chh_ot .= '.gz';
5479 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5480 }
5481 else{
5482 open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5483 # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
5484 }
5485
5486 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
5487 push @sorting_files,$chh_ot;
5488
5489 unless($no_header){
5490 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5491 }
5492
5493 $chh_ctot =~ s/^/CHH_CTOT_/;
5494 $chh_ctot =~ s/sam$/txt/;
5495 $chh_ctot =~ s/bam$/txt/;
5496 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
5497 $chh_ctot = $output_dir . $chh_ctot;
5498
5499 if ($gzip){
5500 $chh_ctot .= '.gz';
5501 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5502 }
5503 else{
5504 open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5505 # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
5506 }
5507
5508 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
5509 push @sorting_files,$chh_ctot;
5510
5511 unless($no_header){
5512 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5513 }
5514
5515 $chh_ctob =~ s/^/CHH_CTOB_/;
5516 $chh_ctob =~ s/sam$/txt/;
5517 $chh_ctob =~ s/bam$/txt/;
5518 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
5519 $chh_ctob = $output_dir . $chh_ctob;
5520
5521 if ($gzip){
5522 $chh_ctob .= '.gz';
5523 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5524 }
5525 else{
5526 open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5527 # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
5528 }
5529
5530 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
5531 push @sorting_files,$chh_ctob;
5532
5533 unless($no_header){
5534 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5535 }
5536
5537 $chh_ob =~ s/^/CHH_OB_/;
5538 $chh_ob =~ s/sam$/txt/;
5539 $chh_ob =~ s/bam$/txt/;
5540 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
5541 $chh_ob = $output_dir . $chh_ob;
5542
5543 if ($gzip){
5544 $chh_ob .= '.gz';
5545 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5546 }
5547 else{
5548 open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5549 # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
5550 }
5551
5552 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
5553 push @sorting_files,$chh_ob;
5554
5555 unless($no_header){
5556 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
5557 }
5558 }
5559 return $report_filename;
5560 }
5561
5562 sub isBam{
5563
5564 my $filename = shift;
5565
5566 # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
5567 open (DISGUISE,"zcat $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";
5568
5569 ### when BAM files read through a zcat stream they start with BAM...
5570 my $bam_in_disguise = <DISGUISE>;
5571 # warn "BAM in disguise: $bam_in_disguise\n\n";
5572
5573 if ($bam_in_disguise){
5574 if ($bam_in_disguise =~ /^BAM/){
5575 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5576 return 1;
5577 }
5578 else{
5579 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5580 return 0;
5581 }
5582 }
5583 else{
5584 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
5585 return 0;
5586 }
5587 }
5588
5589
5590 sub print_helpfile{
5591
5592 print << 'HOW_TO';
5593
5594
5595 DESCRIPTION
5596
5597 The following is a brief description of all options to control the Bismark
5598 methylation extractor. The script reads in a bisulfite read alignment results file
5599 produced by the Bismark bisulfite mapper and extracts the methylation information
5600 for individual cytosines. This information is found in the methylation call field
5601 which can contain the following characters:
5602
5603 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5604 ~~~ X for methylated C in CHG context ~~~
5605 ~~~ x for not methylated C CHG ~~~
5606 ~~~ H for methylated C in CHH context ~~~
5607 ~~~ h for not methylated C in CHH context ~~~
5608 ~~~ Z for methylated C in CpG context ~~~
5609 ~~~ z for not methylated C in CpG context ~~~
5610 ~~~ U for methylated C in Unknown context (CN or CHN ~~~
5611 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
5612 ~~~ . for any bases not involving cytosines ~~~
5613 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5614
5615 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
5616 context (this distinction is actually already made in Bismark itself). As the methylation
5617 information for every C analysed can produce files which easily have tens or even hundreds of
5618 millions of lines, file sizes can become very large and more difficult to handle. The C
5619 methylation info additionally splits cytosine methylation calls up into one of the four possible
5620 strands a given bisulfite read aligned against:
5621
5622 OT original top strand
5623 CTOT complementary to original top strand
5624
5625 OB original bottom strand
5626 CTOB complementary to original bottom strand
5627
5628 Thus, by default twelve individual output files are being generated per input file (unless
5629 --comprehensive is specified, see below). The output files can be imported into a genome
5630 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
5631 unless the bisulfite reads were generated preserving directionality it doesn't make any
5632 sense to look at the data in a strand-specific manner). Strand-specific output files can
5633 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
5634 will be generated. For both the strand-specific and comprehensive outputs there is also
5635 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
5636
5637
5638 The output files are in the following format (tab delimited):
5639
5640 <sequence_id> <strand> <chromosome> <position> <methylation call>
5641
5642
5643 USAGE: methylation_extractor [options] <filenames>
5644
5645
5646 ARGUMENTS:
5647 ==========
5648
5649 <filenames> A space-separated list of Bismark result files in SAM format from
5650 which methylation information is extracted for every cytosine in
5651 the reads. For alignment files in the older custom Bismark output
5652 see option '--vanilla'.
5653
5654 OPTIONS:
5655
5656 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
5657 read data. Specifying either --single-end or --paired-end is
5658 mandatory.
5659
5660 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
5661 read data. Specifying either --paired-end or --single-end is
5662 mandatory.
5663
5664 --vanilla The Bismark result input file(s) are in the old custom Bismark format
5665 (up to version 0.5.x) and not in SAM format which is the default as
5666 of Bismark version 0.6.x or higher. Default: OFF.
5667
5668 --no_overlap For paired-end reads it is theoretically possible that read_1 and
5669 read_2 overlap. This option avoids scoring overlapping methylation
5670 calls twice (only methylation calls of read 1 are used for in the process
5671 since read 1 has historically higher quality basecalls than read 2).
5672 Whilst this option removes a bias towards more methylation calls
5673 in the center of sequenced fragments it may de facto remove a sizable
5674 proportion of the data. This option is on by default for paired-end data
5675 but can be disabled using --include_overlap. Default: ON.
5676
5677 --include_overlap For paired-end data all methylation calls will be extracted irrespective of
5678 of whether they overlap or not. Default: OFF.
5679
5680 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 (or single-end alignment
5681 files) when processing the methylation call string. This can remove e.g. a
5682 restriction enzyme site at the start of each read or any other source of
5683 bias (such as PBAT-Seq data).
5684
5685 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
5686 results only. Since the first couple of bases in Read 2 of BS-Seq experiments
5687 show a severe bias towards non-methylation as a result of end-repairing
5688 sonicated fragments with unmethylated cytosines (see M-bias plot), it is
5689 recommended that the first couple of bp of Read 2 are removed before
5690 starting downstream analysis. Please see the section on M-bias plots in the
5691 Bismark User Guide for more details.
5692
5693 --ignore_3prime <int> Ignore the last <int> bp from the 3' end of Read 1 (or single-end alignment
5694 files) when processing the methylation call string. This can remove unwanted
5695 biases from the end of reads.
5696
5697 --ignore_3prime_r2 <int> Ignore the last <int> bp from the 3' end of Read 2 of paired-end sequencing
5698 results only. This can remove unwanted biases from the end of reads.
5699
5700 --comprehensive Specifying this option will merge all four possible strand-specific
5701 methylation info into context-dependent output files. The default
5702
5703 contexts are:
5704 - CpG context
5705 - CHG context
5706 - CHH context
5707
5708 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
5709 strand-specific output files (default) for Cs in
5710 - CpG context
5711 - non-CpG context
5712
5713 --report Prints out a short methylation summary as well as the paramaters used to run
5714 this script. Default: ON.
5715
5716 --no_header Suppresses the Bismark version header line in all output files for more convenient
5717 batch processing.
5718
5719 -o/--output DIR Allows specification of a different output directory (absolute or relative
5720 path). If not specified explicitely, the output will be written to the current directory.
5721
5722 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
5723 explicitly if Samtools is in the PATH already.
5724
5725 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
5726 a GZIP compressed form to save disk space. This option does not work on bedGraph and
5727 genome-wide cytosine reports as they are 'tiny' anyway.
5728
5729 --version Displays version information.
5730
5731 -h/--help Displays this help file and exits.
5732
5733 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
5734 well as a report (optional) and then quit. Default: OFF.
5735
5736 --mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report.
5737 Only recommended for users who deliberately want to keep an earlier version of the M-bias report.
5738 Default: OFF.
5739
5740 --multicore <int> Sets the number of cores to be used for the methylation extraction process. If system resources
5741 are plentiful this is a viable option to speed up the extraction process (we observed a near linear
5742 speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file
5743 and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore <int>
5744 specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so
5745 --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing
5746 on the bismark2bedGraph or genome-wide cytosine report processes.
5747
5748
5749
5750
5751 bedGraph specific options:
5752 ==========================
5753
5754 --bedGraph After finishing the methylation extraction, the methylation output is written into a
5755 sorted bedGraph file that reports the position of a given cytosine and its methylation
5756 state (in %, see details below). The methylation extractor output is temporarily split up into
5757 temporary files, one per chromosome (written into the current directory or folder
5758 specified with -o/--output); these temp files are then used for sorting and deleted
5759 afterwards. By default, only cytosines in CpG context will be sorted. The option
5760 '--CX_context' may be used to report all cytosines irrespective of sequence context
5761 (this will take MUCH longer!). The default folder for temporary files during the sorting
5762 process is the output directory. The bedGraph conversion step is performed by the external
5763 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
5764 bismark_methylation_extractor itself.
5765
5766 --zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
5767 and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
5768 instead of using 1-based coordinates throughout. Default: OFF.
5769
5770
5771 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
5772 before its methylation percentage is reported. Default: 1.
5773
5774 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
5775
5776 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
5777 in the experiment irrespective of its sequence context. This applies to both forward and
5778 reverse strands. Please be aware that this option may generate large temporary and output files
5779 and may take a long time to sort (up to many hours). Default: OFF.
5780 (i.e. Default = CpG context only).
5781
5782 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
5783 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
5784 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
5785 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
5786 Defaults to 2G.
5787
5788 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
5789 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
5790 individual chromosome files. These errors were caused by the operating system's limit
5791 of the number of filehandle that can be written to at any one time (typically 1024; to
5792 find out this limit on Linux, type: ulimit -a).
5793 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
5794 methylation calls into individual chromosome files. Instead, all input files are
5795 temporarily merged into a single file (unless there is only a single file), and this
5796 file will then be sorted by both chromosome AND position using the Unix sort command.
5797 Please be aware that this option might take a looooong time to complete, depending on
5798 the size of the input files, and the memory you allocate to this process (see --buffer_size).
5799 Nevertheless, it seems to be working.
5800
5801 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
5802 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
5803 sorting process of very large files, but this comes at the cost of a larger memory footprint
5804 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
5805 of RAM). Due to overheads in creating and looping through these arrays it seems that it will
5806 actually be *slower* for small files (few million alignments), and we are currently testing at
5807 which point it is advisable to use this option. Note that --ample_memory is not compatible
5808 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
5809
5810
5811
5812 Genome-wide cytosine methylation report specific options:
5813 =========================================================
5814
5815 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
5816 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
5817 chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all
5818 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
5819 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
5820 covered). The cytosine report conversion step is performed by the external module
5821 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
5822 itself.
5823
5824 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
5825 its context. This applies to both forward and reverse strands. Please be aware that this will
5826 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
5827 Default: OFF (i.e. Default = CpG context only).
5828
5829 --zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF.
5830
5831 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
5832 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
5833
5834 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
5835 will be named to include the input filename and the chromosome number.
5836
5837
5838
5839 OUTPUT:
5840
5841 The bismark_methylation_extractor output is in the form:
5842 ========================================================
5843 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
5844
5845 * Methylated cytosines receive a '+' orientation,
5846 * Unmethylated cytosines receive a '-' orientation.
5847
5848
5849
5850 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
5851 =========================================================================================================
5852
5853 track type=bedGraph (header line)
5854
5855 <chromosome> <start position> <end position> <methylation percentage>
5856
5857
5858
5859 The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'):
5860 ============================================================================================================================================
5861
5862 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
5863
5864
5865
5866 The genome-wide cytosine methylation output file is tab-delimited in the following format:
5867 ==========================================================================================
5868 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
5869
5870
5871
5872 This script was last modified on 22 April 2015.
5873
5874 HOW_TO
5875 }