comparison old/bismark_methylation_extractor @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8 use FindBin qw($Bin);
9 use lib "$Bin/../lib";
10
11
12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
13
14 ## This program is free software: you can redistribute it and/or modify
15 ## it under the terms of the GNU General Public License as published by
16 ## the Free Software Foundation, either version 3 of the License, or
17 ## (at your option) any later version.
18
19 ## This program is distributed in the hope that it will be useful,
20 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ## GNU General Public License for more details.
23
24 ## You should have received a copy of the GNU General Public License
25 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
26
27 my @filenames; # input files
28 my %counting;
29 my $parent_dir = getcwd();
30
31 my %fhs;
32
33 my $version = 'v0.10.1';
34 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline();
35
36
37 ### only needed for bedGraph output
38 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files
39 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
40 my @bedfiles;
41
42 ### only needed for genome-wide cytosine methylation report
43 my %chromosomes;
44
45 my %mbias_1;
46 my %mbias_2;
47
48 ##############################################################################################
49 ### Summarising Run Parameters
50 ##############################################################################################
51
52 ### METHYLATION EXTRACTOR
53
54 warn "Summarising Bismark methylation extractor parameters:\n";
55 warn '='x63,"\n";
56
57 if ($single){
58 if ($vanilla){
59 warn "Bismark single-end vanilla format specified\n";
60 }
61 else{
62 warn "Bismark single-end SAM format specified (default)\n"; # default
63 }
64 }
65 elsif ($paired){
66 if ($vanilla){
67 warn "Bismark paired-end vanilla format specified\n";
68 }
69 else{
70 warn "Bismark paired-end SAM format specified (default)\n"; # default
71 }
72 }
73
74 if ($single){
75 if ($ignore){
76 warn "First $ignore bp will be disregarded when processing the methylation call string\n";
77 }
78 }
79 else{ ## paired-end
80 if ($ignore){
81 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n";
82 }
83 if ($ignore_r2){
84 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n";
85 }
86 }
87
88
89 if ($full){
90 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n";
91 }
92 if ($merge_non_CpG){
93 warn "Merge CHG and CHH context to non-CpG context specified\n";
94 }
95 ### output directory
96 if ($output_dir eq ''){
97 warn "Output will be written to the current directory ('$parent_dir')\n";
98 }
99 else{
100 warn "Output path specified as: $output_dir\n";
101 }
102
103
104 sleep (1);
105
106 ### BEDGRAPH
107
108 if ($bedGraph){
109 warn "\n\nSummarising bedGraph parameters:\n";
110 warn '='x63,"\n";
111
112 if ($counts){
113 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
114 }
115 else{
116 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
117 }
118
119 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";
120
121 if ($CX_context){
122 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
123 }
124 else{ # default
125 $CpG_only = 1;
126 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
127 }
128
129 if ($remove){
130 warn "White spaces in read ID names will be removed prior to sorting\n";
131 }
132
133 if ($ample_mem){
134 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
135 }
136 elsif (defined $sort_size){
137 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
138 }
139 else{
140 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
141 }
142
143
144
145 sleep (1);
146
147 if ($cytosine_report){
148 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
149 warn '='x63,"\n";
150 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";
151
152
153 if ($CX_context){
154 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
155 }
156 else{ # default
157 $CpG_only = 1;
158 warn "Reporting cytosine methylation in CpG context only (default)\n";
159 }
160
161 if ($split_by_chromosome){
162 warn "Splitting the cytosine report output up into individual files for each chromosome\n";
163 }
164
165 ### Zero-based coordinates
166 if ($zero){
167 warn "Using zero-based genomic coordinates (user-defined)\n";
168 }
169 else{ # default, 1-based coords
170 warn "Using 1-based genomic coordinates (default)\n";
171 }
172
173 ### GENOME folder
174 if ($genome_folder){
175 unless ($genome_folder =~/\/$/){
176 $genome_folder =~ s/$/\//;
177 }
178 warn "Genome folder was specified as $genome_folder\n";
179 }
180 else{
181 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
182 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
183 }
184 sleep (1);
185 }
186 }
187
188 warn "\n";
189 sleep (5);
190
191 ######################################################
192 ### PROCESSING FILES
193 ######################################################
194
195 foreach my $filename (@filenames){
196 # resetting counters and filehandles
197 %fhs = ();
198 %counting =(
199 total_meCHG_count => 0,
200 total_meCHH_count => 0,
201 total_meCpG_count => 0,
202 total_unmethylated_CHG_count => 0,
203 total_unmethylated_CHH_count => 0,
204 total_unmethylated_CpG_count => 0,
205 sequences_count => 0,
206 );
207
208 @sorting_files = ();
209 @bedfiles = ();
210
211 %mbias_1 = ();
212 %mbias_2 = ();
213
214 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
215 unless ($vanilla){
216 if ($paired){
217 test_positional_sorting($filename);
218 }
219 }
220
221 process_Bismark_results_file($filename);
222
223 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
224 foreach my $fh (keys %fhs) {
225 if ($fh =~ /^[1230]$/) {
226 foreach my $context (keys %{$fhs{$fh}}) {
227 close $fhs{$fh}->{$context} or die $!;
228 }
229 }
230 else{
231 close $fhs{$fh} or die $!;
232 }
233 }
234
235 ### printing out all M-Bias data
236 produce_mbias_plots ($filename);
237
238 delete_unused_files();
239
240 if ($bedGraph){
241
242 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
243 $out =~ s/gz$//;
244 $out =~ s/sam$//;
245 $out =~ s/bam$//;
246 $out =~ s/txt$//;
247 $out =~ s/$/bedGraph/;
248
249 my $bedGraph_output = $out;
250 my @args;
251
252 if ($remove){
253 push @args, '--remove';
254 }
255 if ($CX_context){
256 push @args, '--CX_context';
257 }
258 if ($no_header){
259 push @args, '--no_header';
260 }
261 if ($gazillion){
262 push @args, '--gazillion';
263 }
264 if ($ample_mem){
265 push @args, '--ample_memory';
266 }
267
268
269 # if ($counts){
270 # push @args, "--counts";
271 # }
272
273 push @args, "--buffer_size $sort_size";
274 push @args, "--cutoff $coverage_threshold";
275 push @args, "--output $bedGraph_output";
276 push @args, "--dir '$output_dir'";
277
278 ### adding all files to be sorted to @args
279 foreach my $f (@sorting_files){
280 push @args, $f;
281 }
282
283 # print join "\t",@args,"\n";
284
285 system ("$Bin/bismark2bedGraph @args");
286
287 warn "Finished BedGraph conversion ...\n\n";
288 sleep(3);
289
290 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!";
291 # warn "Writing bedGraph to file: $bedGraph_output\n";
292 # process_bedGraph_output();
293 # close OUT or die $!;
294
295 ### genome-wide cytosine methylation report requires bedGraph processing anyway
296 if ($cytosine_report){
297
298 @args = (); # resetting @args
299 my $cytosine_out = $out;
300 $cytosine_out =~ s/bedGraph$//;
301
302 if ($CX_context){
303 $cytosine_out =~ s/$/CX_report.txt/;
304 }
305 else{
306 $cytosine_out =~ s/$/CpG_report.txt/;
307 }
308
309 push @args, "--output $cytosine_out";
310 push @args, "--dir '$output_dir'";
311 push @args, "--genome '$genome_folder'";
312 push @args, "--parent_dir '$parent_dir'";
313
314 if ($zero){
315 push @args, "--zero";
316 }
317 if ($CX_context){
318 push @args, '--CX_context';
319 }
320 if ($split_by_chromosome){
321 push @args, '--split_by_chromosome';
322 }
323
324 my $coverage_output = $bedGraph_output;
325 $coverage_output =~ s/bedGraph$/bismark.cov/;
326
327 push @args, $output_dir . $coverage_output; # this will be the infile
328
329 system ("$Bin/coverage2cytosine @args");
330 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out);
331 warn "\n\nFinished generating genome-wide cytosine report\n\n";
332 }
333 }
334 }
335
336 sub delete_unused_files{
337
338 warn "Deleting unused files ...\n\n"; sleep(1);
339
340 my $index = 0;
341
342 while ($index <= $#sorting_files){
343 if ($sorting_files[$index] =~ /gz$/){
344 open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
345 }
346 else{
347 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n";
348 }
349
350 my $used = 0;
351
352 while (<USED>){
353 next if (/^Bismark/);
354 if ($_){
355 $used = 1;
356 last;
357 }
358 }
359
360 if ($used){
361 warn "$sorting_files[$index] contains data ->\tkept\n";
362 ++$index;
363 }
364 else{
365
366 my $delete = unlink $sorting_files[$index];
367
368 if ($delete){
369 warn "$sorting_files[$index] was empty ->\tdeleted\n";
370 }
371 else{
372 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n"
373 }
374
375 ### we also need to remove the element from @sorting_files
376 splice @sorting_files, $index, 1;
377 }
378 }
379 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately)
380 }
381
382 sub produce_mbias_plots{
383
384 my $filename = shift;
385
386 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
387 $mbias =~ s/gz$//;
388 $mbias =~ s/sam$//;
389 $mbias =~ s/bam$//;
390 $mbias =~ s/txt$//;
391 my $mbias_graph_1 = my $mbias_graph_2 = $mbias;
392 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png';
393 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png';
394
395 $mbias =~ s/$/M-bias.txt/;
396
397 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n";
398
399 # determining maximum read length
400 my $max_length_1 = 0;
401 my $max_length_2 = 0;
402
403 foreach my $context (keys %mbias_1){
404 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){
405 $max_length_1 = $pos unless ($max_length_1 >= $pos);
406 }
407 }
408 if ($paired){
409 foreach my $context (keys %mbias_2){
410 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){
411 $max_length_2 = $pos unless ($max_length_2 >= $pos);
412 }
413 }
414 }
415
416 if ($single){
417 warn "Determining maximum read length for M-Bias plot\n";
418 warn "Maximum read length of Read 1: $max_length_1\n\n";
419 }
420 else{
421 warn "Determining maximum read lengths for M-Bias plots\n";
422 warn "Maximum read length of Read 1: $max_length_1\n";
423 warn "Maximum read length of Read 2: $max_length_2\n\n";
424 }
425 # sleep(3);
426
427 my @mbias_read1;
428 my @mbias_read2;
429
430 #Check whether the module GD::Graph:lines is installed
431 my $gd_graph_installed = 0;
432 eval{
433 require GD::Graph::lines;
434 GD::Graph::lines->import();
435 };
436
437 unless($@) { # syntax or routine error variable, set if something goes wron in the last eval{ require ...}
438 $gd_graph_installed = 1;
439
440 #Check whether the module GD::Graph::colour is installed
441 eval{
442 require GD::Graph::colour;
443 GD::Graph::colour->import(qw(:colours :lists :files :convert));
444 };
445
446 if ($@) {
447 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
448 sleep(2);
449 $gd_graph_installed = 0;
450 }
451
452
453 }
454 else{
455 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
456 sleep(2);
457 }
458
459
460 my $graph_title;
461 my $graph1;
462 my $graph2;
463
464 if ( $gd_graph_installed){
465 $graph1 = GD::Graph::lines->new(800,600);
466 if ($paired){
467 $graph2 = GD::Graph::lines->new(800,600);
468 }
469 }
470
471 foreach my $context (qw(CpG CHG CHH)){
472 @{$mbias_read1[0]} = ();
473
474 if ($paired){
475 print MBIAS "$context context (R1)\n================\n";
476 $graph_title = 'M-bias (Read 1)';
477 }
478 else{
479 print MBIAS "$context context\n===========\n";
480 $graph_title = 'M-bias';
481 }
482 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
483
484 foreach my $pos (1..$max_length_1){
485
486 unless (defined $mbias_1{$context}->{$pos}->{meth}){
487 $mbias_1{$context}->{$pos}->{meth} = 0;
488 }
489 unless (defined $mbias_1{$context}->{$pos}->{un}){
490 $mbias_1{$context}->{$pos}->{un} = 0;
491 }
492
493 my $percent = '';
494 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){
495 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) );
496 }
497 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth};
498
499 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n";
500 push @{$mbias_read1[0]},$pos;
501
502 if ($context eq 'CpG'){
503 push @{$mbias_read1[1]},$percent;
504 push @{$mbias_read1[4]},$coverage;
505 }
506 elsif ($context eq 'CHG'){
507 push @{$mbias_read1[2]},$percent;
508 push @{$mbias_read1[5]},$coverage;
509 }
510 elsif ($context eq 'CHH'){
511 push @{$mbias_read1[3]},$percent;
512 push @{$mbias_read1[6]},$coverage;
513 }
514 }
515 print MBIAS "\n";
516 }
517
518 if ( $gd_graph_installed){
519
520 add_colour(nice_blue => [31,120,180]);
521 add_colour(nice_orange => [255,127,0]);
522 add_colour(nice_green => [51,160,44]);
523 add_colour(pale_blue => [153,206,227]);
524 add_colour(pale_orange => [253,204,138]);
525 add_colour(pale_green => [191,230,207]);
526
527 $graph1->set(
528 x_label => 'position (bp)',
529 y1_label => '% methylation',
530 y2_label => '# methylation calls',
531 title => $graph_title,
532 line_width => 2,
533 x_max_value => $max_length_1,
534 x_min_value => 0,
535 y_tick_number => 10,
536 y_label_skip => 2,
537 y1_max_value => 100,
538 y1_min_value => 0,
539 y_label_skip => 2,
540 y2_min_value => 0,
541 x_label_skip => 5,
542 x_label_position => 0.5,
543 x_tick_offset => -1,
544 bgclr => 'white',
545 transparent => 0,
546 two_axes => 1,
547 use_axis => [1,1,1,2,2,2],
548 legend_placement => 'RC',
549 legend_spacing => 6,
550 legend_marker_width => 24,
551 legend_marker_height => 18,
552 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
553 ) or die $graph1->error;
554
555 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
556
557 my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error;
558
559 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n";
560 binmode MBIAS_G1;
561 print MBIAS_G1 $gd1->png;
562 }
563
564 if ($paired){
565
566 foreach my $context (qw(CpG CHG CHH)){
567 @{$mbias_read2[0]} = ();
568
569 print MBIAS "$context context (R2)\n================\n";
570 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";
571 foreach my $pos (1..$max_length_2){
572
573 unless (defined $mbias_2{$context}->{$pos}->{meth}){
574 $mbias_2{$context}->{$pos}->{meth} = 0;
575 }
576 unless (defined $mbias_2{$context}->{$pos}->{un}){
577 $mbias_2{$context}->{$pos}->{un} = 0;
578 }
579
580 my $percent = '';
581 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){
582 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) );
583 }
584 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth};
585
586 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n";
587
588 push @{$mbias_read2[0]},$pos;
589
590 if ($context eq 'CpG'){
591 push @{$mbias_read2[1]},$percent;
592 push @{$mbias_read2[4]},$coverage;
593 }
594 elsif ($context eq 'CHG'){
595 push @{$mbias_read2[2]},$percent;
596 push @{$mbias_read2[5]},$coverage;
597 }
598 elsif ($context eq 'CHH'){
599 push @{$mbias_read2[3]},$percent;
600 push @{$mbias_read2[6]},$coverage;
601 }
602 }
603 print MBIAS "\n";
604 }
605
606 if ( $gd_graph_installed){
607
608 add_colour(nice_blue => [31,120,180]);
609 add_colour(nice_orange => [255,127,0]);
610 add_colour(nice_green => [51,160,44]);
611 add_colour(pale_blue => [153,206,227]);
612 add_colour(pale_orange => [253,204,138]);
613 add_colour(pale_green => [191,230,207]);
614
615 $graph2->set(
616 x_label => 'position (bp)',
617 line_width => 2,
618 x_max_value => $max_length_1,
619 x_min_value => 0,
620 y_tick_number => 10,
621 y_label_skip => 2,
622 y1_max_value => 100,
623 y1_min_value => 0,
624 y_label_skip => 2,
625 y2_min_value => 0,
626 x_label_skip => 5,
627 x_label_position => 0.5,
628 x_tick_offset => -1,
629 bgclr => 'white',
630 transparent => 0,
631 two_axes => 1,
632 use_axis => [1,1,1,2,2,2],
633 legend_placement => 'RC',
634 legend_spacing => 6,
635 legend_marker_width => 24,
636 legend_marker_height => 18,
637 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
638 x_label => 'position (bp)',
639 y1_label => '% methylation',
640 y2_label => '# calls',
641 title => 'M-bias (Read 2)',
642 ) or die $graph2->error;
643
644 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');
645 my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error;
646
647 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n";
648 binmode MBIAS_G2;
649 print MBIAS_G2 $gd2->png;
650
651 }
652 }
653 }
654
655 sub process_commandline{
656 my $help;
657 my $single_end;
658 my $paired_end;
659 my $ignore;
660 my $ignore_r2;
661 my $genomic_fasta;
662 my $full;
663 my $report;
664 my $extractor_version;
665 my $no_overlap;
666 my $merge_non_CpG;
667 my $vanilla;
668 my $output_dir;
669 my $no_header;
670 my $bedGraph;
671 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
672 my $remove;
673 my $counts;
674 my $cytosine_report;
675 my $genome_folder;
676 my $zero;
677 my $CpG_only;
678 my $CX_context;
679 my $split_by_chromosome;
680 my $sort_size;
681 my $samtools_path;
682 my $gzip;
683 my $mbias_only;
684 my $gazillion;
685 my $ample_mem;
686
687 my $command_line = GetOptions ('help|man' => \$help,
688 'p|paired-end' => \$paired_end,
689 's|single-end' => \$single_end,
690 'fasta' => \$genomic_fasta,
691 'ignore=i' => \$ignore,
692 'ignore_r2=i' => \$ignore_r2,
693 'comprehensive' => \$full,
694 'report' => \$report,
695 'version' => \$extractor_version,
696 'no_overlap' => \$no_overlap,
697 'merge_non_CpG' => \$merge_non_CpG,
698 'vanilla' => \$vanilla,
699 'o|output=s' => \$output_dir,
700 'no_header' => \$no_header,
701 'bedGraph' => \$bedGraph,
702 "cutoff=i" => \$coverage_threshold,
703 "remove_spaces" => \$remove,
704 "counts" => \$counts,
705 "cytosine_report" => \$cytosine_report,
706 'g|genome_folder=s' => \$genome_folder,
707 "zero_based" => \$zero,
708 "CX|CX_context" => \$CX_context,
709 "split_by_chromosome" => \$split_by_chromosome,
710 "buffer_size=s" => \$sort_size,
711 'samtools_path=s' => \$samtools_path,
712 "gzip" => \$gzip,
713 "mbias_only" => \$mbias_only,
714 "gazillion|scaffolds" => \$gazillion,
715 "ample_memory" => \$ample_mem,
716 );
717
718 ### EXIT ON ERROR if there were errors with any of the supplied options
719 unless ($command_line){
720 die "Please respecify command line options\n";
721 }
722
723 ### HELPFILE
724 if ($help){
725 print_helpfile();
726 exit;
727 }
728
729 if ($extractor_version){
730 print << "VERSION";
731
732
733 Bismark Methylation Extractor
734
735 Bismark Extractor Version: $version
736 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
737 www.bioinformatics.babraham.ac.uk/projects/bismark/
738
739
740 VERSION
741 exit;
742 }
743
744
745 ### no files provided
746 unless (@ARGV){
747 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
748 }
749 @filenames = @ARGV;
750
751 warn "\n *** Bismark methylation extractor version $version ***\n\n";
752
753 ### M-BIAS ONLY
754 if ($mbias_only){
755 if ($bedGraph){
756 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
757 }
758 if ($cytosine_report){
759 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
760 }
761 if ($merge_non_CpG){
762 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
763 }
764 if ($full){
765 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
766 }
767 sleep(3);
768 }
769
770 ### PRINT A REPORT
771 unless ($report){
772 $report = 0;
773 }
774
775 ### OUTPUT DIR PATH
776 if ($output_dir){
777 unless ($output_dir =~ /\/$/){
778 $output_dir =~ s/$/\//;
779 }
780 }
781 else{
782 $output_dir = '';
783 }
784
785 ### NO HEADER
786 unless ($no_header){
787 $no_header = 0;
788 }
789
790 ### OLD (VANILLA) OUTPUT FORMAT
791 unless ($vanilla){
792 $vanilla = 0;
793 }
794
795 if ($single_end){
796 $paired_end = 0; ### SINGLE END ALIGNMENTS
797 }
798 elsif ($paired_end){
799 $single_end = 0; ### PAIRED-END ALIGNMENTS
800 }
801 else{
802
803 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header
804
805 if ($vanilla){
806 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
807 }
808 else{ # SAM/BAM format
809
810 my $file = $filenames[0];
811 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);
812
813 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
814 if ($file =~ /\.gz$/){
815 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n";
816 }
817 elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){
818 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n";
819 }
820 else{
821 open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
822 }
823
824 while (<DETERMINE>){
825 last unless (/^\@/);
826 if ($_ =~ /^\@PG/){
827 # warn "found the \@PG line:\n";
828 # warn "$_";
829
830 if ($_ =~ /-1/ and $_ =~ /-2/){
831 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
832 $paired_end = 1;
833 $single_end = 0;
834 }
835 else{
836 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
837 $paired_end = 0;
838 $single_end = 1;
839 }
840 }
841 }
842
843 close DETERMINE or warn $!;
844
845 }
846 }
847
848 ### IGNORING <INT> bases at the start of the read when processing the methylation call string
849 unless ($ignore){
850 $ignore = 0;
851 }
852
853 if (defined $ignore_r2){
854 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
855 }
856 else{
857 $ignore_r2 = 0;
858 }
859
860
861 ### NO OVERLAP
862 if ($no_overlap){
863 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
864 }
865 else{
866 $no_overlap = 0;
867 }
868
869 ### COMPREHENSIVE OUTPUT
870 unless ($full){
871 $full = 0;
872 }
873
874 ### MERGE NON-CpG context
875 unless ($merge_non_CpG){
876 $merge_non_CpG = 0;
877 }
878
879 ### remove white spaces in read ID (needed for sorting using the sort command
880 unless ($remove){
881 $remove = 0;
882 }
883
884 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
885 if (defined $coverage_threshold){
886 unless ($coverage_threshold > 0){
887 die "Please select a coverage greater than 0 (positive integers only)\n";
888 }
889 }
890 else{
891 $coverage_threshold = 1;
892 }
893
894 ### SORT buffer size
895 if (defined $sort_size){
896 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
897 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
898 }
899 }
900 else{
901 $sort_size = '2G';
902 }
903
904 if ($zero){
905 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
906 }
907
908 if ($CX_context){
909 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
910 }
911 else{
912 $CX_context = 0;
913 }
914
915 unless ($counts){
916 $counts = 1; # counts will always be set
917 }
918
919 if ($cytosine_report){
920
921 ### GENOME folder
922 if ($genome_folder){
923 unless ($genome_folder =~/\/$/){
924 $genome_folder =~ s/$/\//;
925 }
926 }
927 else{
928 die "Please specify a genome folder to proceed (full path only)\n";
929 }
930
931 unless ($bedGraph){
932 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
933 $bedGraph = 1;
934 }
935 unless ($counts){
936 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n";
937 $counts = 1;
938 }
939 warn "\n";
940 }
941
942 ### PATH TO SAMTOOLS
943 if (defined $samtools_path){
944 # if Samtools was specified as full command
945 if ($samtools_path =~ /samtools$/){
946 if (-e $samtools_path){
947 # Samtools executable found
948 }
949 else{
950 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
951 }
952 }
953 else{
954 unless ($samtools_path =~ /\/$/){
955 $samtools_path =~ s/$/\//;
956 }
957 $samtools_path .= 'samtools';
958 if (-e $samtools_path){
959 # Samtools executable found
960 }
961 else{
962 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
963 }
964 }
965 }
966 # Check whether Samtools is in the PATH if no path was supplied by the user
967 else{
968 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
969 $samtools_path = `which samtools`;
970 chomp $samtools_path;
971 }
972 }
973
974 unless (defined $samtools_path){
975 $samtools_path = '';
976 }
977
978
979 if ($gazillion){
980 if ($ample_mem){
981 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
982 }
983 }
984
985 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem);
986 }
987
988
989 sub test_positional_sorting{
990
991 my $filename = shift;
992
993 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
994 sleep(1);
995
996 if ($filename =~ /\.gz$/) {
997 open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
998 }
999 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
1000 if ($samtools_path){
1001 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
1002 }
1003 else{
1004 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1005 }
1006 }
1007 else {
1008 open (TEST,$filename) or die "Can't open file $filename: $!\n";
1009 }
1010
1011 my $count = 0;
1012
1013 while (<TEST>) {
1014 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted)
1015 if (/^\@SO/) {
1016 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1017 }
1018 next;
1019 }
1020 $count++;
1021
1022 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
1023
1024 my ($id_1) = (split (/\t/));
1025
1026 ### reading the next line which should be read 2
1027 $_ = <TEST>;
1028 my ($id_2) = (split (/\t/));
1029 last unless ($id_2);
1030 ++$count;
1031
1032 if ($id_1 eq $id_2){
1033 ### ids are the same
1034 next;
1035 }
1036 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
1037 my $id_1_trunc = $id_1;
1038 $id_1_trunc =~ s/\/1$//;
1039 my $id_2_trunc = $id_2;
1040 $id_2_trunc =~ s/\/2$//;
1041
1042 unless ($id_1_trunc eq $id_2_trunc){
1043 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
1044 }
1045 }
1046 }
1047 # close TEST or die $!; somehow fails on our cluster...
1048 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
1049 warn "...passed!\n";
1050 sleep(1);
1051
1052 }
1053
1054
1055 sub process_Bismark_results_file{
1056 my $filename = shift;
1057
1058 warn "\nNow reading in Bismark result file $filename\n\n";
1059
1060 if ($filename =~ /\.gz$/) {
1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
1062 }
1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
1064 if ($samtools_path){
1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
1066 }
1067 else{
1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
1069 }
1070 }
1071 else {
1072 open (IN,$filename) or die "Can't open file $filename: $!\n";
1073 }
1074
1075 ### Vanilla and SAM output need to read different numbers of header lines
1076 if ($vanilla) {
1077 my $bismark_version = <IN>; ## discarding the Bismark version info
1078 chomp $bismark_version;
1079 $bismark_version =~ s/\r//; # replaces \r line feed
1080 $bismark_version =~ s/Bismark version: //;
1081 if ($bismark_version =~ /^\@/) {
1082 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
1083 sleep (2);
1084 }
1085
1086 unless ($version eq $bismark_version){
1087 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
1088 }
1089 } else {
1090 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
1091 # We are reading from it further down
1092 }
1093
1094 my $output_filename = (split (/\//,$filename))[-1];
1095
1096 ### OPENING OUTPUT-FILEHANDLES
1097 if ($report) {
1098 my $report_filename = $output_filename;
1099 $report_filename =~ s/\.sam$//;
1100 $report_filename =~ s/\.txt$//;
1101 $report_filename =~ s/$/_splitting_report.txt/;
1102 $report_filename = $output_dir . $report_filename;
1103 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
1104 }
1105
1106 if ($report) {
1107 print REPORT "$output_filename\n\n";
1108 print REPORT "Parameters used to extract methylation information:\n";
1109 if ($paired) {
1110 if ($vanilla) {
1111 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
1112 } else {
1113 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
1114 }
1115 }
1116
1117 if ($single) {
1118 if ($vanilla) {
1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
1120 } else {
1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
1122 }
1123 }
1124 if ($single){
1125 if ($ignore) {
1126 print REPORT "Ignoring first $ignore bp\n";
1127 }
1128 }
1129 else{ # paired-end
1130 if ($ignore) {
1131 print REPORT "Ignoring first $ignore bp of Read 1\n";
1132 }
1133 if ($ignore_r2){
1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
1135 }
1136 }
1137
1138 if ($full) {
1139 print REPORT "Output specified: comprehensive\n";
1140 } else {
1141 print REPORT "Output specified: strand-specific (default)\n";
1142 }
1143
1144 if ($no_overlap) {
1145 print REPORT "No overlapping methylation calls specified\n";
1146 }
1147 if ($genomic_fasta) {
1148 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
1149 }
1150 if ($merge_non_CpG) {
1151 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
1152 }
1153
1154 print REPORT "\n";
1155 }
1156
1157 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
1158
1159 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
1160 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
1161 if ($full and $merge_non_CpG) {
1162 my $cpg_output = my $other_c_output = $output_filename;
1163 ### C in CpG context
1164 $cpg_output =~ s/^/CpG_context_/;
1165 $cpg_output =~ s/sam$/txt/;
1166 $cpg_output =~ s/bam$/txt/;
1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
1168 $cpg_output = $output_dir . $cpg_output;
1169
1170 if ($gzip){
1171 $cpg_output .= '.gz';
1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
1173 }
1174 else{
1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
1176 }
1177
1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
1179 push @sorting_files,$cpg_output;
1180
1181 unless ($no_header) {
1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1183 }
1184
1185 ### C in any other context than CpG
1186 $other_c_output =~ s/^/Non_CpG_context_/;
1187 $other_c_output =~ s/sam$/txt/;
1188 $other_c_output =~ s/bam$/txt/;
1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
1190 $other_c_output = $output_dir . $other_c_output;
1191
1192 if ($gzip){
1193 $other_c_output .= '.gz';
1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
1195 }
1196 else{
1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
1198 }
1199
1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
1201 push @sorting_files,$other_c_output;
1202
1203
1204 unless ($no_header) {
1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1206 }
1207 }
1208
1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
1210 elsif ($merge_non_CpG) {
1211
1212 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
1213
1214 ### For cytosines in CpG context
1215 $cpg_ot =~ s/^/CpG_OT_/;
1216 $cpg_ot =~ s/sam$/txt/;
1217 $cpg_ot =~ s/bam$/txt/;
1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
1219 $cpg_ot = $output_dir . $cpg_ot;
1220
1221 if ($gzip){
1222 $cpg_ot .= '.gz';
1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
1224 }
1225 else{
1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
1227 }
1228
1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
1230 push @sorting_files,$cpg_ot;
1231
1232 unless($no_header){
1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1234 }
1235
1236 $cpg_ctot =~ s/^/CpG_CTOT_/;
1237 $cpg_ctot =~ s/sam$/txt/;
1238 $cpg_ctot =~ s/bam$/txt/;
1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
1240 $cpg_ctot = $output_dir . $cpg_ctot;
1241
1242 if ($gzip){
1243 $cpg_ctot .= '.gz';
1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
1245 }
1246 else{
1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
1248 }
1249
1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
1251 push @sorting_files,$cpg_ctot;
1252
1253 unless($no_header){
1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1255 }
1256
1257 $cpg_ctob =~ s/^/CpG_CTOB_/;
1258 $cpg_ctob =~ s/sam$/txt/;
1259 $cpg_ctob =~ s/bam$/txt/;
1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
1261 $cpg_ctob = $output_dir . $cpg_ctob;
1262
1263 if ($gzip){
1264 $cpg_ctob .= '.gz';
1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
1266 }
1267 else{
1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
1269 }
1270
1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
1272 push @sorting_files,$cpg_ctob;
1273
1274 unless($no_header){
1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1276 }
1277
1278 $cpg_ob =~ s/^/CpG_OB_/;
1279 $cpg_ob =~ s/sam$/txt/;
1280 $cpg_ob =~ s/bam$/txt/;
1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
1282 $cpg_ob = $output_dir . $cpg_ob;
1283
1284 if ($gzip){
1285 $cpg_ob .= '.gz';
1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1287 }
1288 else{
1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1290 }
1291
1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
1293 push @sorting_files,$cpg_ob;
1294
1295 unless($no_header){
1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1297 }
1298
1299 ### For cytosines in Non-CpG (CC, CT or CA) context
1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
1301
1302 $other_c_ot =~ s/^/Non_CpG_OT_/;
1303 $other_c_ot =~ s/sam$/txt/;
1304 $other_c_ot =~ s/bam$/txt/;
1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
1306 $other_c_ot = $output_dir . $other_c_ot;
1307
1308 if ($gzip){
1309 $other_c_ot .= '.gz';
1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
1311 }
1312 else{
1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
1314 }
1315
1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
1317 push @sorting_files,$other_c_ot;
1318
1319 unless($no_header){
1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1321 }
1322
1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
1324 $other_c_ctot =~ s/sam$/txt/;
1325 $other_c_ctot =~ s/bam$/txt/;
1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
1327 $other_c_ctot = $output_dir . $other_c_ctot;
1328
1329 if ($gzip){
1330 $other_c_ctot .= '.gz';
1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
1332 }
1333 else{
1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
1335 }
1336
1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
1338 push @sorting_files,$other_c_ctot;
1339
1340 unless($no_header){
1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1342 }
1343
1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
1345 $other_c_ctob =~ s/sam$/txt/;
1346 $other_c_ctob =~ s/bam$/txt/;
1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
1348 $other_c_ctob = $output_dir . $other_c_ctob;
1349
1350 if ($gzip){
1351 $other_c_ctob .= '.gz';
1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
1353 }
1354 else{
1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
1356 }
1357
1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
1359 push @sorting_files,$other_c_ctob;
1360
1361 unless($no_header){
1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1363 }
1364
1365 $other_c_ob =~ s/^/Non_CpG_OB_/;
1366 $other_c_ob =~ s/sam$/txt/;
1367 $other_c_ob =~ s/sam$/txt/;
1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
1369 $other_c_ob = $output_dir . $other_c_ob;
1370
1371 if ($gzip){
1372 $other_c_ob .= '.gz';
1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
1374 }
1375 else{
1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
1377 }
1378
1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
1380 push @sorting_files,$other_c_ob;
1381
1382 unless($no_header){
1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1384 }
1385 }
1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
1387
1388 ### if --comprehensive was specified we are only writing one file per context
1389 elsif ($full) {
1390 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
1391 ### C in CpG context
1392 $cpg_output =~ s/^/CpG_context_/;
1393 $cpg_output =~ s/sam$/txt/;
1394 $cpg_output =~ s/bam$/txt/;
1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
1396 $cpg_output = $output_dir . $cpg_output;
1397
1398 if ($gzip){
1399 $cpg_output .= '.gz';
1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
1401 }
1402 else{
1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
1404 }
1405
1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
1407 push @sorting_files,$cpg_output;
1408
1409 unless($no_header){
1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1411 }
1412
1413 ### C in CHG context
1414 $chg_output =~ s/^/CHG_context_/;
1415 $chg_output =~ s/sam$/txt/;
1416 $chg_output =~ s/bam$/txt/;
1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
1418 $chg_output = $output_dir . $chg_output;
1419
1420 if ($gzip){
1421 $chg_output .= '.gz';
1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
1423 }
1424 else{
1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
1426 }
1427
1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
1429 push @sorting_files,$chg_output;
1430
1431 unless($no_header){
1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1433 }
1434
1435 ### C in CHH context
1436 $chh_output =~ s/^/CHH_context_/;
1437 $chh_output =~ s/sam$/txt/;
1438 $chh_output =~ s/bam$/txt/;
1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
1440 $chh_output = $output_dir . $chh_output;
1441
1442 if ($gzip){
1443 $chh_output .= '.gz';
1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
1445 }
1446 else{
1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
1448 }
1449
1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
1451 push @sorting_files, $chh_output;
1452
1453 unless($no_header){
1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1455 }
1456 }
1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
1458 else {
1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
1460
1461 ### For cytosines in CpG context
1462 $cpg_ot =~ s/^/CpG_OT_/;
1463 $cpg_ot =~ s/sam$/txt/;
1464 $cpg_ot =~ s/bam$/txt/;
1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
1466 $cpg_ot = $output_dir . $cpg_ot;
1467
1468 if ($gzip){
1469 $cpg_ot .= '.gz';
1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
1471 }
1472 else{
1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
1474 }
1475
1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
1477 push @sorting_files,$cpg_ot;
1478
1479 unless($no_header){
1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1481 }
1482
1483 $cpg_ctot =~ s/^/CpG_CTOT_/;
1484 $cpg_ctot =~ s/sam$/txt/;
1485 $cpg_ctot =~ s/bam$/txt/;
1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
1487 $cpg_ctot = $output_dir . $cpg_ctot;
1488
1489 if ($gzip){
1490 $cpg_ctot .= '.gz';
1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
1492 }
1493 else{
1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
1495 }
1496
1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
1498 push @sorting_files,$cpg_ctot;
1499
1500 unless($no_header){
1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1502 }
1503
1504 $cpg_ctob =~ s/^/CpG_CTOB_/;
1505 $cpg_ctob =~ s/sam$/txt/;
1506 $cpg_ctob =~ s/bam$/txt/;
1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
1508 $cpg_ctob = $output_dir . $cpg_ctob;
1509
1510 if ($gzip){
1511 $cpg_ctob .= '.gz';
1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
1513 }
1514 else{
1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
1516 }
1517
1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
1519 push @sorting_files,$cpg_ctob;
1520
1521 unless($no_header){
1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1523 }
1524
1525 $cpg_ob =~ s/^/CpG_OB_/;
1526 $cpg_ob =~ s/sam$/txt/;
1527 $cpg_ob =~ s/bam$/txt/;
1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
1529 $cpg_ob = $output_dir . $cpg_ob;
1530
1531 if ($gzip){
1532 $cpg_ob .= '.gz';
1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1534 }
1535 else{
1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
1537 }
1538
1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
1540 push @sorting_files,$cpg_ob;
1541
1542 unless($no_header){
1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1544 }
1545
1546 ### For cytosines in CHG context
1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
1548
1549 $chg_ot =~ s/^/CHG_OT_/;
1550 $chg_ot =~ s/sam$/txt/;
1551 $chg_ot =~ s/bam$/txt/;
1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
1553 $chg_ot = $output_dir . $chg_ot;
1554
1555 if ($gzip){
1556 $chg_ot .= '.gz';
1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
1558 }
1559 else{
1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
1561 }
1562
1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
1564 push @sorting_files,$chg_ot;
1565
1566 unless($no_header){
1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1568 }
1569
1570 $chg_ctot =~ s/^/CHG_CTOT_/;
1571 $chg_ctot =~ s/sam$/txt/;
1572 $chg_ctot =~ s/bam$/txt/;
1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
1574 $chg_ctot = $output_dir . $chg_ctot;
1575
1576 if ($gzip){
1577 $chg_ctot .= '.gz';
1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
1579 }
1580 else{
1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
1582 }
1583
1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
1585 push @sorting_files,$chg_ctot;
1586
1587 unless($no_header){
1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1589 }
1590
1591 $chg_ctob =~ s/^/CHG_CTOB_/;
1592 $chg_ctob =~ s/sam$/txt/;
1593 $chg_ctob =~ s/bam$/txt/;
1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
1595 $chg_ctob = $output_dir . $chg_ctob;
1596
1597 if ($gzip){
1598 $chg_ctob .= '.gz';
1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
1600 }
1601 else{
1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
1603 }
1604
1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
1606 push @sorting_files,$chg_ctob;
1607
1608 unless($no_header){
1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1610 }
1611
1612 $chg_ob =~ s/^/CHG_OB_/;
1613 $chg_ob =~ s/sam$/txt/;
1614 $chg_ob =~ s/bam$/txt/;
1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
1616 $chg_ob = $output_dir . $chg_ob;
1617
1618 if ($gzip){
1619 $chg_ob .= '.gz';
1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
1621 }
1622 else{
1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
1624 }
1625
1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
1627 push @sorting_files,$chg_ob;
1628
1629 unless($no_header){
1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1631 }
1632
1633 ### For cytosines in CHH context
1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
1635
1636 $chh_ot =~ s/^/CHH_OT_/;
1637 $chh_ot =~ s/sam$/txt/;
1638 $chh_ot =~ s/bam$/txt/;
1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
1640 $chh_ot = $output_dir . $chh_ot;
1641
1642 if ($gzip){
1643 $chh_ot .= '.gz';
1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
1645 }
1646 else{
1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
1648 }
1649
1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
1651 push @sorting_files,$chh_ot;
1652
1653 unless($no_header){
1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1655 }
1656
1657 $chh_ctot =~ s/^/CHH_CTOT_/;
1658 $chh_ctot =~ s/sam$/txt/;
1659 $chh_ctot =~ s/bam$/txt/;
1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
1661 $chh_ctot = $output_dir . $chh_ctot;
1662
1663 if ($gzip){
1664 $chh_ctot .= '.gz';
1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
1666 }
1667 else{
1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
1669 }
1670
1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
1672 push @sorting_files,$chh_ctot;
1673
1674 unless($no_header){
1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1676 }
1677
1678 $chh_ctob =~ s/^/CHH_CTOB_/;
1679 $chh_ctob =~ s/sam$/txt/;
1680 $chh_ctob =~ s/bam$/txt/;
1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
1682 $chh_ctob = $output_dir . $chh_ctob;
1683
1684 if ($gzip){
1685 $chh_ctob .= '.gz';
1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
1687 }
1688 else{
1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
1690 }
1691
1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
1693 push @sorting_files,$chh_ctob;
1694
1695 unless($no_header){
1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1697 }
1698
1699 $chh_ob =~ s/^/CHH_OB_/;
1700 $chh_ob =~ s/sam$/txt/;
1701 $chh_ob =~ s/bam$/txt/;
1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
1703 $chh_ob = $output_dir . $chh_ob;
1704
1705 if ($gzip){
1706 $chh_ob .= '.gz';
1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
1708 }
1709 else{
1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
1711 }
1712
1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
1714 push @sorting_files,$chh_ob;
1715
1716 unless($no_header){
1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
1718 }
1719 }
1720
1721 my $methylation_call_strings_processed = 0;
1722 my $line_count = 0;
1723
1724 ### proceeding differently now for single-end or paired-end Bismark files
1725
1726 ### PROCESSING SINGLE-END RESULT FILES
1727 if ($single) {
1728
1729 ### also proceeding differently now for SAM format or vanilla Bismark format files
1730 if ($vanilla) { # old vanilla Bismark output format
1731 while (<IN>) {
1732 ++$line_count;
1733 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1734
1735 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1736 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
1737
1738 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
1739 ### last position
1740 chomp $genome_conversion;
1741
1742 my $index;
1743 if ($meth_call) {
1744
1745 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1746 $index = 0;
1747 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1748 $index = 1;
1749 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1750 $index = 3;
1751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1752 $index = 2;
1753 } else {
1754 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1755 }
1756
1757 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1758 if ($ignore) {
1759 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1760
1761 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1762 if ($strand eq '+') {
1763 $start += $ignore;
1764 } elsif ($strand eq '-') {
1765 $start += length($meth_call)-1; ## $meth_call is already shortened!
1766 } else {
1767 die "Alignment did not have proper strand information: $strand\n";
1768 }
1769 }
1770 ### printing out the methylation state of every C in the read
1771 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
1772
1773 ++$methylation_call_strings_processed; # 1 per single-end result
1774 }
1775 }
1776 } else { # processing single-end SAM format (default)
1777 while (<IN>) {
1778 ### SAM format can either start with header lines (starting with @) or start with alignments directly
1779 if (/^\@/) { # skipping header lines (starting with @)
1780 warn "skipping SAM header line:\t$_";
1781 next;
1782 }
1783
1784 ++$line_count;
1785 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
1786
1787 # example read in SAM format
1788 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
1789 ###
1790
1791 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
1792 # < 0.7.6 $meth_call =~ s/^XM:Z://;
1793 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
1794 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
1795
1796 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
1797
1798 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
1799 my $meth_call; ### Thanks to Zachary Zeno for this solution
1800 my $read_conversion;
1801 my $genome_conversion;
1802
1803 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
1804 my $tag = $1;
1805 my $value = $2;
1806
1807 if ($tag eq "XM") {
1808 $meth_call = $value;
1809 $meth_call =~ s/\r//;
1810 } elsif ($tag eq "XR") {
1811 $read_conversion = $value;
1812 $read_conversion =~ s/\r//;
1813 } elsif ($tag eq "XG") {
1814 $genome_conversion = $value;
1815 $genome_conversion =~ s/\r//;
1816 }
1817 }
1818
1819 my $strand;
1820 chomp $genome_conversion;
1821 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
1822
1823 my $index;
1824 if ($meth_call) {
1825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
1826 $index = 0;
1827 $strand = '+';
1828 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
1829 $index = 1;
1830 $strand = '-';
1831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
1832 $index = 2;
1833 $strand = '+';
1834 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
1835 $index = 3;
1836 $strand = '-';
1837 } else {
1838 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
1839 }
1840
1841 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
1842 if ($strand eq '-') {
1843 $meth_call = reverse $meth_call;
1844 }
1845
1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
1847 if ($ignore) {
1848 # print "\n\n$meth_call\n";
1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
1850 # print "$meth_call\n";
1851
1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
1853
1854 my @len = split (/\D+/,$cigar); # storing the length per operation
1855 my @ops = split (/\d+/,$cigar); # storing the operation
1856 shift @ops; # remove the empty first element
1857 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
1858
1859 my @comp_cigar; # building an array with all CIGAR operations
1860 foreach my $index (0..$#len) {
1861 foreach (1..$len[$index]) {
1862 # print "$ops[$index]";
1863 push @comp_cigar, $ops[$index];
1864 }
1865 }
1866 # print "original CIGAR: $cigar\n";
1867 # print "original CIGAR: @comp_cigar\n";
1868
1869 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
1870 if ($strand eq '+') {
1871
1872 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
1873 my $I_count = 0;
1874
1875 for (1..$ignore) {
1876 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
1877 # print "$_ deleted $op\n";
1878
1879 while ($op eq 'D') { # repeating this for deletions (D)
1880 $D_count++;
1881 $op = shift @comp_cigar;
1882 # print "$_ deleted $op\n";
1883 }
1884 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
1885 $I_count++;
1886 }
1887 }
1888 $start += $ignore + $D_count - $I_count;
1889 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
1890 } elsif ($strand eq '-') {
1891
1892 for (1..$ignore) {
1893 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
1894 while ($op eq 'D') { # repeating this for deletions (D)
1895 $op = pop @comp_cigar;
1896 }
1897 }
1898
1899 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
1900 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
1901 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
1902 foreach (@comp_cigar) {
1903 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
1904 }
1905 $start += $MD_count - 1;
1906 }
1907
1908 ### reconstituting shortened CIGAR string
1909 my $new_cigar;
1910 my $count = 0;
1911 my $last_op;
1912 # print "ignore adjusted: @comp_cigar\n";
1913 foreach my $op (@comp_cigar) {
1914 unless (defined $last_op){
1915 $last_op = $op;
1916 ++$count;
1917 next;
1918 }
1919 if ($last_op eq $op) {
1920 ++$count;
1921 } else {
1922 $new_cigar .= "$count$last_op";
1923 $last_op = $op;
1924 $count = 1;
1925 }
1926 }
1927 $new_cigar .= "$count$last_op"; # appending the last operation and count
1928 $cigar = $new_cigar;
1929 # print "ignore adjusted scalar: $cigar\n";
1930 }
1931 }
1932 ### printing out the methylation state of every C in the read
1933 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
1934
1935 ++$methylation_call_strings_processed; # 1 per single-end result
1936 }
1937 }
1938 }
1939
1940 ### PROCESSING PAIRED-END RESULT FILES
1941 elsif ($paired) {
1942
1943 ### proceeding differently now for SAM format or vanilla Bismark format files
1944 if ($vanilla) { # old vanilla Bismark paired-end output format
1945 while (<IN>) {
1946 ++$line_count;
1947 warn "processed line: $line_count\n" if ($line_count%500000==0);
1948
1949 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
1950 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
1951
1952 my $index;
1953 chomp $genome_conversion;
1954
1955 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
1956 $index = 0; ## this is OT
1957 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
1958 $index = 2; ## this is CTOB!!!
1959 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
1960 $index = 1; ## this is CTOT!!!
1961 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
1962 $index = 3; ## this is OB
1963 } else {
1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
1965 }
1966
1967 if ($meth_call_1 and $meth_call_2) {
1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
1969
1970 if ($ignore) {
1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
1972
1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
1974 $start_read_1 += $ignore;
1975 }
1976 if ($ignore_r2) {
1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
1978
1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
1980 $end_read_2 -= $ignore_r2;
1981 }
1982
1983 my $end_read_1;
1984 my $start_read_2;
1985
1986 if ($strand eq '+') {
1987
1988 $end_read_1 = $start_read_1+length($meth_call_1)-1;
1989 $start_read_2 = $end_read_2-length($meth_call_2)+1;
1990
1991 ## we first pass the first read which is in + orientation on the forward strand
1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
1993
1994 # we next pass the second read which is in - orientation on the reverse strand
1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
1997 }
1998 else {
1999
2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
2002
2003 ## we first pass the first read which is in - orientation on the reverse strand
2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
2005
2006 # we next pass the second read which is in + orientation on the forward strand
2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
2009 }
2010
2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
2012 }
2013 }
2014 }
2015 else { # Bismark paired-end SAM output format (default)
2016 while (<IN>) {
2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly
2018 if (/^\@/) { # skipping header lines (starting with @)
2019 warn "skipping SAM header line:\t$_";
2020 next;
2021 }
2022
2023 ++$line_count;
2024 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
2025
2026 # example paired-end reads in SAM format (2 consecutive lines)
2027 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
2028 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
2029
2030 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
2031
2032 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
2033 my $meth_call_1;
2034 my $first_read_conversion;
2035 my $genome_conversion;
2036
2037 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
2038 my $tag = $1;
2039 my $value = $2;
2040
2041 if ($tag eq "XM") {
2042 $meth_call_1 = $value;
2043 $meth_call_1 =~ s/\r//;
2044 } elsif ($tag eq "XR") {
2045 $first_read_conversion = $value;
2046 $first_read_conversion =~ s/\r//;
2047 } elsif ($tag eq "XG") {
2048 $genome_conversion = $value;
2049 $genome_conversion =~ s/\r//;
2050 }
2051 }
2052
2053 $_ = <IN>; # reading in the paired read
2054
2055 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
2056 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
2057 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
2058 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
2059 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
2060
2061 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
2062
2063 my $meth_call_2;
2064 my $second_read_conversion;
2065
2066 while ( /(XM|XR):Z:([^\t]+)/g ) {
2067 my $tag = $1;
2068 my $value = $2;
2069
2070 if ($tag eq "XM") {
2071 $meth_call_2 = $value;
2072 $meth_call_2 =~ s/\r//;
2073 } elsif ($tag eq "XR") {
2074 $second_read_conversion = $value;
2075 $second_read_conversion = s/\r//;
2076 }
2077 }
2078
2079 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
2080 chomp $genome_conversion; # in case it captured a new line character
2081
2082 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
2083
2084 my $index;
2085 my $strand;
2086
2087 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
2088 $index = 0; ## this is OT
2089 $strand = '+';
2090 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
2091 $index = 1; ## this is CTOT
2092 $strand = '-';
2093 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
2094 $index = 2; ## this is CTOB
2095 $strand = '+';
2096 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
2097 $index = 3; ## this is OB
2098 $strand = '-';
2099 } else {
2100 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
2101 }
2102
2103 ### reversing the methylation call of the read that was reverse-complemented
2104 if ($strand eq '+') {
2105 $meth_call_2 = reverse $meth_call_2;
2106 } else {
2107 $meth_call_1 = reverse $meth_call_1;
2108 }
2109
2110 if ($meth_call_1 and $meth_call_2) {
2111
2112 my $end_read_1;
2113
2114 ### READ 1
2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
2117 shift @ops_1; # remove the empty first element
2118
2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
2120
2121 my @comp_cigar_1; # building an array with all CIGAR operations
2122 foreach my $index (0..$#len_1) {
2123 foreach (1..$len_1[$index]) {
2124 # print "$ops_1[$index]";
2125 push @comp_cigar_1, $ops_1[$index];
2126 }
2127 }
2128 # print "original CIGAR read 1: $cigar_1\n";
2129 # print "original CIGAR read 1: @comp_cigar_1\n";
2130
2131 ### READ 2
2132 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
2133 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
2134 shift @ops_2; # remove the empty first element
2135 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
2136 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
2137 foreach my $index (0..$#len_2) {
2138 foreach (1..$len_2[$index]) {
2139 # print "$ops_2[$index]";
2140 push @comp_cigar_2, $ops_2[$index];
2141 }
2142 }
2143 # print "original CIGAR read 2: $cigar_2\n";
2144 # print "original CIGAR read 2: @comp_cigar_2\n";
2145
2146
2147
2148 if ($ignore) {
2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1
2150 ### the methylation calls have already been reversed where necessary
2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
2152
2153 if ($strand eq '+') {
2154
2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
2157 my $I_count_1 = 0;
2158
2159 for (1..$ignore) {
2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
2161 # print "$_ deleted $op\n";
2162
2163 while ($op eq 'D') { # repeating this for deletions (D)
2164 $D_count_1++;
2165 $op = shift @comp_cigar_1;
2166 # print "$_ deleted $op\n";
2167 }
2168 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2169 $I_count_1++;
2170 }
2171 }
2172
2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
2175
2176 # the start position of reads mapping to the reverse strand is being adjusted further below
2177 }
2178 elsif ($strand eq '-') {
2179
2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
2181 for (1..$ignore) {
2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2183 while ($op eq 'D') { # repeating this for deletions (D)
2184 $op = pop @comp_cigar_1;
2185 }
2186 }
2187 # the start position of reads mapping to the reverse strand is being adjusted further below
2188
2189 }
2190 }
2191
2192 if ($ignore_r2) {
2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
2194 ### the methylation calls have already been reversed where necessary
2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
2196
2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
2198
2199 if ($strand eq '+') {
2200
2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
2202
2203 for (1..$ignore_r2) {
2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
2205 while ($op eq 'D') { # repeating this for deletions (D)
2206 $op = pop @comp_cigar_2;
2207 }
2208 }
2209 # the start position of reads mapping to the reverse strand is being adjusted further below
2210 }
2211 elsif ($strand eq '-') {
2212
2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
2215 my $I_count_2 = 0;
2216
2217 for (1..$ignore_r2) {
2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
2219 # print "$_ deleted $op\n";
2220
2221 while ($op eq 'D') { # repeating this for deletions (D)
2222 $D_count_2++;
2223 $op = shift @comp_cigar_2;
2224 # print "$_ deleted $op\n";
2225 }
2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
2227 $I_count_2++;
2228 }
2229 }
2230
2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
2233 }
2234 }
2235
2236 if ($ignore){
2237 ### reconstituting shortened CIGAR string 1
2238 my $new_cigar_1;
2239 my $count_1 = 0;
2240 my $last_op_1;
2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
2242 foreach my $op (@comp_cigar_1) {
2243 unless (defined $last_op_1){
2244 $last_op_1 = $op;
2245 ++$count_1;
2246 next;
2247 }
2248 if ($last_op_1 eq $op) {
2249 ++$count_1;
2250 } else {
2251 $new_cigar_1 .= "$count_1$last_op_1";
2252 $last_op_1 = $op;
2253 $count_1 = 1;
2254 }
2255 }
2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
2257 $cigar_1 = $new_cigar_1;
2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
2259 }
2260
2261 if ($ignore_r2){
2262
2263 ### reconstituting shortened CIGAR string 2
2264 my $new_cigar_2;
2265 my $count_2 = 0;
2266 my $last_op_2;
2267 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
2268 foreach my $op (@comp_cigar_2) {
2269 unless (defined $last_op_2){
2270 $last_op_2 = $op;
2271 ++$count_2;
2272 next;
2273 }
2274 if ($last_op_2 eq $op) {
2275 ++$count_2;
2276 }
2277 else {
2278 $new_cigar_2 .= "$count_2$last_op_2";
2279 $last_op_2 = $op;
2280 $count_2 = 1;
2281 }
2282 }
2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
2284 $cigar_2 = $new_cigar_2;
2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
2286 }
2287
2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
2289
2290 if ($strand eq '+') {
2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2293 # print "reverse: @comp_cigar_2\n";
2294
2295 my $MD_count_1 = 0;
2296 foreach (@comp_cigar_1) {
2297 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2298 }
2299
2300 my $MD_count_2 = 0;
2301 foreach (@comp_cigar_2) {
2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2303 }
2304
2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
2307 }
2308 else {
2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
2310
2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2312 # print "reverse: @comp_cigar_1\n";
2313
2314 my $MD_count_1 = 0;
2315 foreach (@comp_cigar_1) {
2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
2317 }
2318
2319 $end_read_1 = $start_read_1;
2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
2321 }
2322
2323 if ($strand eq '+') {
2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
2326
2327 # we next pass the second read which is in - orientation on the reverse strand
2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
2330 } else {
2331 ## we first pass the first read which is in - orientation on the reverse strand
2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
2333
2334 # we next pass the second read which is in + orientation on the forward strand
2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
2337 }
2338
2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
2340 }
2341 }
2342 }
2343 } else {
2344 die "Single-end or paired-end reads not specified properly\n";
2345 }
2346
2347 warn "\n\nProcessed $line_count lines from $filename in total\n";
2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
2349 if ($report) {
2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n";
2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
2352 }
2353 print_splitting_report ();
2354 }
2355
2356
2357
2358 sub print_splitting_report{
2359
2360 ### Calculating methylation percentages if applicable
2361
2362 my $percent_meCpG;
2363 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
2364 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
2365 }
2366
2367 my $percent_meCHG;
2368 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2369 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
2370 }
2371
2372 my $percent_meCHH;
2373 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
2374 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
2375 }
2376
2377 my $percent_non_CpG_methylation;
2378 if ($merge_non_CpG){
2379 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
2380 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) );
2381 }
2382 }
2383
2384 if ($report){
2385 ### detailed information about Cs analysed
2386 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
2387
2388 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2389 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
2390
2391 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2392 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2393 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2394
2395 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2396 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2397 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2398
2399 ### calculating methylated CpG percentage if applicable
2400 if ($percent_meCpG){
2401 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
2402 }
2403 else{
2404 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2405 }
2406
2407 ### 2-Context Output
2408 if ($merge_non_CpG){
2409 if ($percent_non_CpG_methylation){
2410 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2411 }
2412 else{
2413 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2414 }
2415 }
2416
2417 ### 3 Context Output
2418 else{
2419 ### calculating methylated CHG percentage if applicable
2420 if ($percent_meCHG){
2421 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
2422 }
2423 else{
2424 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2425 }
2426
2427 ### calculating methylated CHH percentage if applicable
2428 if ($percent_meCHH){
2429 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2430 }
2431 else{
2432 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2433 }
2434 }
2435 }
2436
2437 ### detailed information about Cs analysed for on-screen report
2438 print "Final Cytosine Methylation Report\n",'='x33,"\n";
2439
2440 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
2441 print "Total number of C's analysed:\t$total_number_of_C\n\n";
2442
2443 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
2444 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
2445 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
2446
2447 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
2448 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
2449 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
2450
2451 ### printing methylated CpG percentage if applicable
2452 if ($percent_meCpG){
2453 print "C methylated in CpG context:\t${percent_meCpG}%\n";
2454 }
2455 else{
2456 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
2457 }
2458
2459 ### 2-Context Output
2460 if ($merge_non_CpG){
2461 if ($percent_non_CpG_methylation){
2462 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n";
2463 }
2464 else{
2465 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n";
2466 }
2467 }
2468
2469 ### 3-Context Output
2470 else{
2471 ### printing methylated CHG percentage if applicable
2472 if ($percent_meCHG){
2473 print "C methylated in CHG context:\t${percent_meCHG}%\n";
2474 }
2475 else{
2476 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
2477 }
2478
2479 ### printing methylated CHH percentage if applicable
2480 if ($percent_meCHH){
2481 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
2482 }
2483 else{
2484 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
2485 }
2486 }
2487 }
2488
2489
2490
2491 sub print_individual_C_methylation_states_paired_end_files{
2492
2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
2494
2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
2497
2498 my @methylation_calls = split(//,$meth_call);
2499
2500 #################################################################
2501 ### . for bases not involving cytosines ###
2502 ### X for methylated C in CHG context (was protected) ###
2503 ### x for not methylated C in CHG context (was converted) ###
2504 ### H for methylated C in CHH context (was protected) ###
2505 ### h for not methylated C in CHH context (was converted) ###
2506 ### Z for methylated C in CpG context (was protected) ###
2507 ### z for not methylated C in CpG context (was converted) ###
2508 ### U for methylated C in Unknown context (was protected) ###
2509 ### u for not methylated C in Unknown context (was converted) ###
2510 #################################################################
2511
2512 my $methyl_CHG_count = 0;
2513 my $methyl_CHH_count = 0;
2514 my $methyl_CpG_count = 0;
2515 my $unmethylated_CHG_count = 0;
2516 my $unmethylated_CHH_count = 0;
2517 my $unmethylated_CpG_count = 0;
2518
2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
2521 my @comp_cigar;
2522
2523 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
2524 if ($cigar =~ /^\d+M$/){
2525 # this check speeds up the extraction process by up to 60%!!!
2526 }
2527 else{ # parsing CIGAR string
2528 my @len;
2529 my @ops;
2530 @len = split (/\D+/,$cigar); # storing the length per operation
2531 @ops = split (/\d+/,$cigar); # storing the operation
2532 shift @ops; # remove the empty first element
2533
2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
2535
2536 foreach my $index (0..$#len){
2537 foreach (1..$len[$index]){
2538 # print "$ops[$index]";
2539 push @comp_cigar, $ops[$index];
2540 }
2541 }
2542 # warn "\nDetected CIGAR string: $cigar\n";
2543 # warn "Length of methylation call: ",length $meth_call,"\n";
2544 # warn "number of operations: ",scalar @ops,"\n";
2545 # warn "number of length digits: ",scalar @len,"\n\n";
2546 # print @comp_cigar,"\n";
2547 # print "$meth_call\n\n";
2548 # sleep (1);
2549 }
2550
2551 if ($strand eq '-') {
2552
2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
2554 if (@comp_cigar){
2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
2556 }
2557 # print "reverse CIGAR string: @comp_cigar\n";
2558
2559 ### the start position of paired-end files has already been corrected, see above
2560 }
2561
2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
2563
2564 if ($merge_non_CpG) {
2565 if ($no_overlap) { # this has to be read 2...
2566
2567 ### single-file CpG and non-CpG context output
2568 if ($full) {
2569 if ($strand eq '+') {
2570 for my $index (0..$#methylation_calls) {
2571
2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2575 $cigar_offset += $cigar_mod;
2576 $pos_offset += $pos_mod;
2577 }
2578
2579 ### Returning as soon as the methylation calls start overlapping
2580 if ($start+$index+$pos_offset >= $end_read_1) {
2581 return;
2582 }
2583
2584 if ($methylation_calls[$index] eq 'X') {
2585 $counting{total_meCHG_count}++;
2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2587 if ($read_identity == 1){
2588 $mbias_1{CHG}->{$index+1}->{meth}++;
2589 }
2590 else{
2591 $mbias_2{CHG}->{$index+1}->{meth}++;
2592 }
2593 }
2594 elsif ($methylation_calls[$index] eq 'x') {
2595 $counting{total_unmethylated_CHG_count}++;
2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2597 if ($read_identity == 1){
2598 $mbias_1{CHG}->{$index+1}->{un}++;
2599 }
2600 else{
2601 $mbias_2{CHG}->{$index+1}->{un}++;
2602 }
2603 }
2604 elsif ($methylation_calls[$index] eq 'Z') {
2605 $counting{total_meCpG_count}++;
2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2607 if ($read_identity == 1){
2608 $mbias_1{CpG}->{$index+1}->{meth}++;
2609 }
2610 else{
2611 $mbias_2{CpG}->{$index+1}->{meth}++;
2612 }
2613 }
2614 elsif ($methylation_calls[$index] eq 'z') {
2615 $counting{total_unmethylated_CpG_count}++;
2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2617 if ($read_identity == 1){
2618 $mbias_1{CpG}->{$index+1}->{un}++;
2619 }
2620 else{
2621 $mbias_2{CpG}->{$index+1}->{un}++;
2622 }
2623 }
2624 elsif ($methylation_calls[$index] eq 'H') {
2625 $counting{total_meCHH_count}++;
2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2627 if ($read_identity == 1){
2628 $mbias_1{CHH}->{$index+1}->{meth}++;
2629 }
2630 else{
2631 $mbias_2{CHH}->{$index+1}->{meth}++;
2632 }
2633 }
2634 elsif ($methylation_calls[$index] eq 'h') {
2635 $counting{total_unmethylated_CHH_count}++;
2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2637 if ($read_identity == 1){
2638 $mbias_1{CHH}->{$index+1}->{un}++;
2639 }
2640 else{
2641 $mbias_2{CHH}->{$index+1}->{un}++;
2642 }
2643 }
2644 elsif ($methylation_calls[$index] eq '.'){}
2645 elsif (lc$methylation_calls[$index] eq 'u'){}
2646 else{
2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2648 }
2649 }
2650 }
2651 elsif ($strand eq '-') {
2652 for my $index (0..$#methylation_calls) {
2653
2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2657 $cigar_offset += $cigar_mod;
2658 $pos_offset += $pos_mod;
2659 }
2660
2661 ### Returning as soon as the methylation calls start overlapping
2662 if ($start-$index+$pos_offset <= $end_read_1) {
2663 return;
2664 }
2665
2666 if ($methylation_calls[$index] eq 'X') {
2667 $counting{total_meCHG_count}++;
2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2669 if ($read_identity == 1){
2670 $mbias_1{CHG}->{$index+1}->{meth}++;
2671 }
2672 else{
2673 $mbias_2{CHG}->{$index+1}->{meth}++;
2674 }
2675 }
2676 elsif ($methylation_calls[$index] eq 'x') {
2677 $counting{total_unmethylated_CHG_count}++;
2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2679 if ($read_identity == 1){
2680 $mbias_1{CHG}->{$index+1}->{un}++;
2681 }
2682 else{
2683 $mbias_2{CHG}->{$index+1}->{un}++;
2684 }
2685 }
2686 elsif ($methylation_calls[$index] eq 'Z') {
2687 $counting{total_meCpG_count}++;
2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2689 if ($read_identity == 1){
2690 $mbias_1{CpG}->{$index+1}->{meth}++;
2691 }
2692 else{
2693 $mbias_2{CpG}->{$index+1}->{meth}++;
2694 }
2695 }
2696 elsif ($methylation_calls[$index] eq 'z') {
2697 $counting{total_unmethylated_CpG_count}++;
2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2699 if ($read_identity == 1){
2700 $mbias_1{CpG}->{$index+1}->{un}++;
2701 }
2702 else{
2703 $mbias_2{CpG}->{$index+1}->{un}++;
2704 }
2705 }
2706 elsif ($methylation_calls[$index] eq 'H') {
2707 $counting{total_meCHH_count}++;
2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2709 if ($read_identity == 1){
2710 $mbias_1{CHH}->{$index+1}->{meth}++;
2711 }
2712 else{
2713 $mbias_2{CHH}->{$index+1}->{meth}++;
2714 }
2715 }
2716 elsif ($methylation_calls[$index] eq 'h') {
2717 $counting{total_unmethylated_CHH_count}++;
2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2719 if ($read_identity == 1){
2720 $mbias_1{CHH}->{$index+1}->{un}++;
2721 }
2722 else{
2723 $mbias_2{CHH}->{$index+1}->{un}++;
2724 }
2725 }
2726 elsif ($methylation_calls[$index] eq '.') {}
2727 elsif (lc$methylation_calls[$index] eq 'u'){}
2728 else{
2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2730 }
2731 }
2732 } else {
2733 die "The read orientation was neither + nor -: '$strand'\n";
2734 }
2735 }
2736
2737 ### strand-specific methylation output
2738 else {
2739 if ($strand eq '+') {
2740 for my $index (0..$#methylation_calls) {
2741
2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2745 $cigar_offset += $cigar_mod;
2746 $pos_offset += $pos_mod;
2747 }
2748
2749 ### Returning as soon as the methylation calls start overlapping
2750 if ($start+$index+$pos_offset >= $end_read_1) {
2751 return;
2752 }
2753
2754 if ($methylation_calls[$index] eq 'X') {
2755 $counting{total_meCHG_count}++;
2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2757 if ($read_identity == 1){
2758 $mbias_1{CHG}->{$index+1}->{meth}++;
2759 }
2760 else{
2761 $mbias_2{CHG}->{$index+1}->{meth}++;
2762 }
2763 }
2764 elsif ($methylation_calls[$index] eq 'x') {
2765 $counting{total_unmethylated_CHG_count}++;
2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2767 if ($read_identity == 1){
2768 $mbias_1{CHG}->{$index+1}->{un}++;
2769 }
2770 else{
2771 $mbias_2{CHG}->{$index+1}->{un}++;
2772 }
2773 }
2774 elsif ($methylation_calls[$index] eq 'Z') {
2775 $counting{total_meCpG_count}++;
2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2777 if ($read_identity == 1){
2778 $mbias_1{CpG}->{$index+1}->{meth}++;
2779 }
2780 else{
2781 $mbias_2{CpG}->{$index+1}->{meth}++;
2782 }
2783 }
2784 elsif ($methylation_calls[$index] eq 'z') {
2785 $counting{total_unmethylated_CpG_count}++;
2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2787 if ($read_identity == 1){
2788 $mbias_1{CpG}->{$index+1}->{un}++;
2789 }
2790 else{
2791 $mbias_2{CpG}->{$index+1}->{un}++;
2792 }
2793 }
2794 elsif ($methylation_calls[$index] eq 'H') {
2795 $counting{total_meCHH_count}++;
2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2797 if ($read_identity == 1){
2798 $mbias_1{CHH}->{$index+1}->{meth}++;
2799 }
2800 else{
2801 $mbias_2{CHH}->{$index+1}->{meth}++;
2802 }
2803 }
2804 elsif ($methylation_calls[$index] eq 'h') {
2805 $counting{total_unmethylated_CHH_count}++;
2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2807 if ($read_identity == 1){
2808 $mbias_1{CHH}->{$index+1}->{un}++;
2809 }
2810 else{
2811 $mbias_2{CHH}->{$index+1}->{un}++;
2812 }
2813 }
2814 elsif ($methylation_calls[$index] eq '.') {}
2815 elsif (lc$methylation_calls[$index] eq 'u'){}
2816 else{
2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2818 }
2819 }
2820 } elsif ($strand eq '-') {
2821 for my $index (0..$#methylation_calls) {
2822
2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2826 $cigar_offset += $cigar_mod;
2827 $pos_offset += $pos_mod;
2828 }
2829
2830 ### Returning as soon as the methylation calls start overlapping
2831 if ($start-$index+$pos_offset <= $end_read_1) {
2832 return;
2833 }
2834
2835 if ($methylation_calls[$index] eq 'X') {
2836 $counting{total_meCHG_count}++;
2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2838 if ($read_identity == 1){
2839 $mbias_1{CHG}->{$index+1}->{meth}++;
2840 }
2841 else{
2842 $mbias_2{CHG}->{$index+1}->{meth}++;
2843 }
2844 }
2845 elsif ($methylation_calls[$index] eq 'x') {
2846 $counting{total_unmethylated_CHG_count}++;
2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2848 if ($read_identity == 1){
2849 $mbias_1{CHG}->{$index+1}->{un}++;
2850 }
2851 else{
2852 $mbias_2{CHG}->{$index+1}->{un}++;
2853 }
2854 }
2855 elsif ($methylation_calls[$index] eq 'Z') {
2856 $counting{total_meCpG_count}++;
2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2858 if ($read_identity == 1){
2859 $mbias_1{CpG}->{$index+1}->{meth}++;
2860 }
2861 else{
2862 $mbias_2{CpG}->{$index+1}->{meth}++;
2863 }
2864 }
2865 elsif ($methylation_calls[$index] eq 'z') {
2866 $counting{total_unmethylated_CpG_count}++;
2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2868 if ($read_identity == 1){
2869 $mbias_1{CpG}->{$index+1}->{un}++;
2870 }
2871 else{
2872 $mbias_2{CpG}->{$index+1}->{un}++;
2873 }
2874 }
2875 elsif ($methylation_calls[$index] eq 'H') {
2876 $counting{total_meCHH_count}++;
2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2878 if ($read_identity == 1){
2879 $mbias_1{CHH}->{$index+1}->{meth}++;
2880 }
2881 else{
2882 $mbias_2{CHH}->{$index+1}->{meth}++;
2883 }
2884 }
2885 elsif ($methylation_calls[$index] eq 'h') {
2886 $counting{total_unmethylated_CHH_count}++;
2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2888 if ($read_identity == 1){
2889 $mbias_1{CHH}->{$index+1}->{un}++;
2890 }
2891 else{
2892 $mbias_2{CHH}->{$index+1}->{un}++;
2893 }
2894 }
2895 elsif ($methylation_calls[$index] eq '.') {}
2896 elsif (lc$methylation_calls[$index] eq 'u'){}
2897 else{
2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
2899 }
2900 }
2901 } else {
2902 die "The strand orientation was neither + nor -: '$strand'/n";
2903 }
2904 }
2905 }
2906
2907 ### this is the default paired-end procedure allowing overlaps and using every single C position
2908 ### Still within the 2-CONTEXT ONLY optional section
2909 else {
2910 ### single-file CpG and non-CpG context output
2911 if ($full) {
2912 if ($strand eq '+') {
2913 for my $index (0..$#methylation_calls) {
2914
2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
2918 $cigar_offset += $cigar_mod;
2919 $pos_offset += $pos_mod;
2920 }
2921
2922 if ($methylation_calls[$index] eq 'X') {
2923 $counting{total_meCHG_count}++;
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2925 if ($read_identity == 1){
2926 $mbias_1{CHG}->{$index+1}->{meth}++;
2927 }
2928 else{
2929 $mbias_2{CHG}->{$index+1}->{meth}++;
2930 }
2931 }
2932 elsif ($methylation_calls[$index] eq 'x') {
2933 $counting{total_unmethylated_CHG_count}++;
2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2935 if ($read_identity == 1){
2936 $mbias_1{CHG}->{$index+1}->{un}++;
2937 }
2938 else{
2939 $mbias_2{CHG}->{$index+1}->{un}++;
2940 }
2941 }
2942 elsif ($methylation_calls[$index] eq 'Z') {
2943 $counting{total_meCpG_count}++;
2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2945 if ($read_identity == 1){
2946 $mbias_1{CpG}->{$index+1}->{meth}++;
2947 }
2948 else{
2949 $mbias_2{CpG}->{$index+1}->{meth}++;
2950 }
2951 }
2952 elsif ($methylation_calls[$index] eq 'z') {
2953 $counting{total_unmethylated_CpG_count}++;
2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2955 if ($read_identity == 1){
2956 $mbias_1{CpG}->{$index+1}->{un}++;
2957 }
2958 else{
2959 $mbias_2{CpG}->{$index+1}->{un}++;
2960 }
2961 }
2962 elsif ($methylation_calls[$index] eq 'H') {
2963 $counting{total_meCHH_count}++;
2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2965 if ($read_identity == 1){
2966 $mbias_1{CHH}->{$index+1}->{meth}++;
2967 }
2968 else{
2969 $mbias_2{CHH}->{$index+1}->{meth}++;
2970 }
2971 }
2972 elsif ($methylation_calls[$index] eq 'h') {
2973 $counting{total_unmethylated_CHH_count}++;
2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
2975 if ($read_identity == 1){
2976 $mbias_1{CHH}->{$index+1}->{un}++;
2977 }
2978 else{
2979 $mbias_2{CHH}->{$index+1}->{un}++;
2980 }
2981 }
2982 elsif ($methylation_calls[$index] eq '.') {}
2983 elsif (lc$methylation_calls[$index] eq 'u'){}
2984 else{
2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
2986 }
2987 }
2988 } elsif ($strand eq '-') {
2989 for my $index (0..$#methylation_calls) {
2990
2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
2994 $cigar_offset += $cigar_mod;
2995 $pos_offset += $pos_mod;
2996 }
2997
2998 if ($methylation_calls[$index] eq 'X') {
2999 $counting{total_meCHG_count}++;
3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3001 if ($read_identity == 1){
3002 $mbias_1{CHG}->{$index+1}->{meth}++;
3003 }
3004 else{
3005 $mbias_2{CHG}->{$index+1}->{meth}++;
3006 }
3007 }
3008 elsif ($methylation_calls[$index] eq 'x') {
3009 $counting{total_unmethylated_CHG_count}++;
3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3011 if ($read_identity == 1){
3012 $mbias_1{CHG}->{$index+1}->{un}++;
3013 }
3014 else{
3015 $mbias_2{CHG}->{$index+1}->{un}++;
3016 }
3017 }
3018 elsif ($methylation_calls[$index] eq 'Z') {
3019 $counting{total_meCpG_count}++;
3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3021 if ($read_identity == 1){
3022 $mbias_1{CpG}->{$index+1}->{meth}++;
3023 }
3024 else{
3025 $mbias_2{CpG}->{$index+1}->{meth}++;
3026 }
3027 }
3028 elsif ($methylation_calls[$index] eq 'z') {
3029 $counting{total_unmethylated_CpG_count}++;
3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3031 if ($read_identity == 1){
3032 $mbias_1{CpG}->{$index+1}->{un}++;
3033 }
3034 else{
3035 $mbias_2{CpG}->{$index+1}->{un}++;
3036 }
3037 }
3038 elsif ($methylation_calls[$index] eq 'H') {
3039 $counting{total_meCHH_count}++;
3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3041 if ($read_identity == 1){
3042 $mbias_1{CHH}->{$index+1}->{meth}++;
3043 }
3044 else{
3045 $mbias_2{CHH}->{$index+1}->{meth}++;
3046 }
3047 }
3048 elsif ($methylation_calls[$index] eq 'h') {
3049 $counting{total_unmethylated_CHH_count}++;
3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3051 if ($read_identity == 1){
3052 $mbias_1{CHH}->{$index+1}->{un}++;
3053 }
3054 else{
3055 $mbias_2{CHH}->{$index+1}->{un}++;
3056 }
3057 }
3058 elsif ($methylation_calls[$index] eq '.') {}
3059 elsif (lc$methylation_calls[$index] eq 'u'){}
3060 else{
3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
3062 }
3063 }
3064 } else {
3065 die "The strand orientation as neither + nor -: '$strand'\n";
3066 }
3067 }
3068
3069 ### strand-specific methylation output
3070 ### still within the 2-CONTEXT optional section
3071 else {
3072 if ($strand eq '+') {
3073 for my $index (0..$#methylation_calls) {
3074
3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3078 $cigar_offset += $cigar_mod;
3079 $pos_offset += $pos_mod;
3080 }
3081
3082 if ($methylation_calls[$index] eq 'X') {
3083 $counting{total_meCHG_count}++;
3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3085 if ($read_identity == 1){
3086 $mbias_1{CHG}->{$index+1}->{meth}++;
3087 }
3088 else{
3089 $mbias_2{CHG}->{$index+1}->{meth}++;
3090 }
3091 }
3092 elsif ($methylation_calls[$index] eq 'x') {
3093 $counting{total_unmethylated_CHG_count}++;
3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3095 if ($read_identity == 1){
3096 $mbias_1{CHG}->{$index+1}->{un}++;
3097 }
3098 else{
3099 $mbias_2{CHG}->{$index+1}->{un}++;
3100 }
3101 }
3102 elsif ($methylation_calls[$index] eq 'Z') {
3103 $counting{total_meCpG_count}++;
3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3105 if ($read_identity == 1){
3106 $mbias_1{CpG}->{$index+1}->{meth}++;
3107 }
3108 else{
3109 $mbias_2{CpG}->{$index+1}->{meth}++;
3110 }
3111 }
3112 elsif ($methylation_calls[$index] eq 'z') {
3113 $counting{total_unmethylated_CpG_count}++;
3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3115 if ($read_identity == 1){
3116 $mbias_1{CpG}->{$index+1}->{un}++;
3117 }
3118 else{
3119 $mbias_2{CpG}->{$index+1}->{un}++;
3120 }
3121 }
3122 elsif ($methylation_calls[$index] eq 'H') {
3123 $counting{total_meCHH_count}++;
3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3125 if ($read_identity == 1){
3126 $mbias_1{CHH}->{$index+1}->{meth}++;
3127 }
3128 else{
3129 $mbias_2{CHH}->{$index+1}->{meth}++;
3130 }
3131 }
3132 elsif ($methylation_calls[$index] eq 'h') {
3133 $counting{total_unmethylated_CHH_count}++;
3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3135 if ($read_identity == 1){
3136 $mbias_1{CHH}->{$index+1}->{un}++;
3137 }
3138 else{
3139 $mbias_2{CHH}->{$index+1}->{un}++;
3140 }
3141 }
3142 elsif ($methylation_calls[$index] eq '.') {}
3143 elsif (lc$methylation_calls[$index] eq 'u'){}
3144 else{
3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3146 }
3147 }
3148 } elsif ($strand eq '-') {
3149 for my $index (0..$#methylation_calls) {
3150
3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3154 $cigar_offset += $cigar_mod;
3155 $pos_offset += $pos_mod;
3156 }
3157
3158 if ($methylation_calls[$index] eq 'X') {
3159 $counting{total_meCHG_count}++;
3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3161 if ($read_identity == 1){
3162 $mbias_1{CHG}->{$index+1}->{meth}++;
3163 }
3164 else{
3165 $mbias_2{CHG}->{$index+1}->{meth}++;
3166 }
3167 }
3168 elsif ($methylation_calls[$index] eq 'x') {
3169 $counting{total_unmethylated_CHG_count}++;
3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3171 if ($read_identity == 1){
3172 $mbias_1{CHG}->{$index+1}->{un}++;
3173 }
3174 else{
3175 $mbias_2{CHG}->{$index+1}->{un}++;
3176 }
3177 }
3178 elsif ($methylation_calls[$index] eq 'Z') {
3179 $counting{total_meCpG_count}++;
3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3181 if ($read_identity == 1){
3182 $mbias_1{CpG}->{$index+1}->{meth}++;
3183 }
3184 else{
3185 $mbias_2{CpG}->{$index+1}->{meth}++;
3186 }
3187 }
3188 elsif ($methylation_calls[$index] eq 'z') {
3189 $counting{total_unmethylated_CpG_count}++;
3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3191 if ($read_identity == 1){
3192 $mbias_1{CpG}->{$index+1}->{un}++;
3193 }
3194 else{
3195 $mbias_2{CpG}->{$index+1}->{un}++;
3196 }
3197 }
3198 elsif ($methylation_calls[$index] eq 'H') {
3199 $counting{total_meCHH_count}++;
3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3201 if ($read_identity == 1){
3202 $mbias_1{CHH}->{$index+1}->{meth}++;
3203 }
3204 else{
3205 $mbias_2{CHH}->{$index+1}->{meth}++;
3206 }
3207 }
3208 elsif ($methylation_calls[$index] eq 'h') {
3209 $counting{total_unmethylated_CHH_count}++;
3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3211 if ($read_identity == 1){
3212 $mbias_1{CHH}->{$index+1}->{un}++;
3213 }
3214 else{
3215 $mbias_2{CHH}->{$index+1}->{un}++;
3216 }
3217 }
3218 elsif ($methylation_calls[$index] eq '.') {}
3219 elsif (lc$methylation_calls[$index] eq 'u'){}
3220 else{
3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3222 }
3223 }
3224 } else {
3225 die "The strand orientation as neither + nor -: '$strand'\n";
3226 }
3227 }
3228 }
3229 }
3230
3231 ############################################
3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
3233 ############################################
3234
3235 elsif ($no_overlap) {
3236 ### single-file CpG, CHG and CHH context output
3237 if ($full) {
3238 if ($strand eq '+') {
3239 for my $index (0..$#methylation_calls) {
3240
3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3244 $cigar_offset += $cigar_mod;
3245 $pos_offset += $pos_mod;
3246 }
3247
3248 ### Returning as soon as the methylation calls start overlapping
3249 if ($start+$index+$pos_offset >= $end_read_1) {
3250 return;
3251 }
3252
3253 if ($methylation_calls[$index] eq 'X') {
3254 $counting{total_meCHG_count}++;
3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3256 if ($read_identity == 1){
3257 $mbias_1{CHG}->{$index+1}->{meth}++;
3258 }
3259 else{
3260 $mbias_2{CHG}->{$index+1}->{meth}++;
3261 }
3262 }
3263 elsif ($methylation_calls[$index] eq 'x') {
3264 $counting{total_unmethylated_CHG_count}++;
3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3266 if ($read_identity == 1){
3267 $mbias_1{CHG}->{$index+1}->{un}++;
3268 }
3269 else{
3270 $mbias_2{CHG}->{$index+1}->{un}++;
3271 }
3272 }
3273 elsif ($methylation_calls[$index] eq 'Z') {
3274 $counting{total_meCpG_count}++;
3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3276 if ($read_identity == 1){
3277 $mbias_1{CpG}->{$index+1}->{meth}++;
3278 }
3279 else{
3280 $mbias_2{CpG}->{$index+1}->{meth}++;
3281 }
3282 }
3283 elsif ($methylation_calls[$index] eq 'z') {
3284 $counting{total_unmethylated_CpG_count}++;
3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3286 if ($read_identity == 1){
3287 $mbias_1{CpG}->{$index+1}->{un}++;
3288 }
3289 else{
3290 $mbias_2{CpG}->{$index+1}->{un}++;
3291 }
3292 }
3293 elsif ($methylation_calls[$index] eq 'H') {
3294 $counting{total_meCHH_count}++;
3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3296 if ($read_identity == 1){
3297 $mbias_1{CHH}->{$index+1}->{meth}++;
3298 }
3299 else{
3300 $mbias_2{CHH}->{$index+1}->{meth}++;
3301 }
3302 }
3303 elsif ($methylation_calls[$index] eq 'h') {
3304 $counting{total_unmethylated_CHH_count}++;
3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3306 if ($read_identity == 1){
3307 $mbias_1{CHH}->{$index+1}->{un}++;
3308 }
3309 else{
3310 $mbias_2{CHH}->{$index+1}->{un}++;
3311 }
3312 }
3313 elsif ($methylation_calls[$index] eq '.') {}
3314 elsif (lc$methylation_calls[$index] eq 'u'){}
3315 else{
3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3317 }
3318 }
3319 } elsif ($strand eq '-') {
3320 for my $index (0..$#methylation_calls) {
3321
3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3325 $cigar_offset += $cigar_mod;
3326 $pos_offset += $pos_mod;
3327 }
3328
3329 ### Returning as soon as the methylation calls start overlapping
3330 if ($start-$index+$pos_offset <= $end_read_1) {
3331 return;
3332 }
3333
3334 if ($methylation_calls[$index] eq 'X') {
3335 $counting{total_meCHG_count}++;
3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3337 if ($read_identity == 1){
3338 $mbias_1{CHG}->{$index+1}->{meth}++;
3339 }
3340 else{
3341 $mbias_2{CHG}->{$index+1}->{meth}++;
3342 }
3343 }
3344 elsif ($methylation_calls[$index] eq 'x') {
3345 $counting{total_unmethylated_CHG_count}++;
3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3347 if ($read_identity == 1){
3348 $mbias_1{CHG}->{$index+1}->{un}++;
3349 }
3350 else{
3351 $mbias_2{CHG}->{$index+1}->{un}++;
3352 }
3353 }
3354 elsif ($methylation_calls[$index] eq 'Z') {
3355 $counting{total_meCpG_count}++;
3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3357 if ($read_identity == 1){
3358 $mbias_1{CpG}->{$index+1}->{meth}++;
3359 }
3360 else{
3361 $mbias_2{CpG}->{$index+1}->{meth}++;
3362 }
3363 }
3364 elsif ($methylation_calls[$index] eq 'z') {
3365 $counting{total_unmethylated_CpG_count}++;
3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3367 if ($read_identity == 1){
3368 $mbias_1{CpG}->{$index+1}->{un}++;
3369 }
3370 else{
3371 $mbias_2{CpG}->{$index+1}->{un}++;
3372 }
3373 }
3374 elsif ($methylation_calls[$index] eq 'H') {
3375 $counting{total_meCHH_count}++;
3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3377 if ($read_identity == 1){
3378 $mbias_1{CHH}->{$index+1}->{meth}++;
3379 }
3380 else{
3381 $mbias_2{CHH}->{$index+1}->{meth}++;
3382 }
3383 }
3384 elsif ($methylation_calls[$index] eq 'h') {
3385 $counting{total_unmethylated_CHH_count}++;
3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3387 if ($read_identity == 1){
3388 $mbias_1{CHH}->{$index+1}->{un}++;
3389 }
3390 else{
3391 $mbias_2{CHH}->{$index+1}->{un}++;
3392 }
3393 }
3394 elsif ($methylation_calls[$index] eq '.') {}
3395 elsif (lc$methylation_calls[$index] eq 'u'){}
3396 else{
3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3398 }
3399 }
3400 } else {
3401 die "The strand orientation as neither + nor -: '$strand'\n";
3402 }
3403 }
3404
3405 ### strand-specific methylation output
3406 else {
3407 if ($strand eq '+') {
3408 for my $index (0..$#methylation_calls) {
3409
3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3413 $cigar_offset += $cigar_mod;
3414 $pos_offset += $pos_mod;
3415 }
3416
3417 ### Returning as soon as the methylation calls start overlapping
3418 if ($start+$index+$pos_offset >= $end_read_1) {
3419 return;
3420 }
3421
3422 if ($methylation_calls[$index] eq 'X') {
3423 $counting{total_meCHG_count}++;
3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3425 if ($read_identity == 1){
3426 $mbias_1{CHG}->{$index+1}->{meth}++;
3427 }
3428 else{
3429 $mbias_2{CHG}->{$index+1}->{meth}++;
3430 }
3431 }
3432 elsif ($methylation_calls[$index] eq 'x') {
3433 $counting{total_unmethylated_CHG_count}++;
3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3435 if ($read_identity == 1){
3436 $mbias_1{CHG}->{$index+1}->{un}++;
3437 }
3438 else{
3439 $mbias_2{CHG}->{$index+1}->{un}++;
3440 }
3441 }
3442 elsif ($methylation_calls[$index] eq 'Z') {
3443 $counting{total_meCpG_count}++;
3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3445 if ($read_identity == 1){
3446 $mbias_1{CpG}->{$index+1}->{meth}++;
3447 }
3448 else{
3449 $mbias_2{CpG}->{$index+1}->{meth}++;
3450 }
3451 }
3452 elsif ($methylation_calls[$index] eq 'z') {
3453 $counting{total_unmethylated_CpG_count}++;
3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3455 if ($read_identity == 1){
3456 $mbias_1{CpG}->{$index+1}->{un}++;
3457 }
3458 else{
3459 $mbias_2{CpG}->{$index+1}->{un}++;
3460 }
3461 }
3462 elsif ($methylation_calls[$index] eq 'H') {
3463 $counting{total_meCHH_count}++;
3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3465 if ($read_identity == 1){
3466 $mbias_1{CHH}->{$index+1}->{meth}++;
3467 }
3468 else{
3469 $mbias_2{CHH}->{$index+1}->{meth}++;
3470 }
3471 }
3472 elsif ($methylation_calls[$index] eq 'h') {
3473 $counting{total_unmethylated_CHH_count}++;
3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3475 if ($read_identity == 1){
3476 $mbias_1{CHH}->{$index+1}->{un}++;
3477 }
3478 else{
3479 $mbias_2{CHH}->{$index+1}->{un}++;
3480 }
3481 }
3482 elsif ($methylation_calls[$index] eq '.') {}
3483 elsif (lc$methylation_calls[$index] eq 'u'){}
3484 else{
3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3486 }
3487 }
3488 } elsif ($strand eq '-') {
3489 for my $index (0..$#methylation_calls) {
3490
3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3494 $cigar_offset += $cigar_mod;
3495 $pos_offset += $pos_mod;
3496 }
3497
3498 ### Returning as soon as the methylation calls start overlapping
3499 if ($start-$index+$pos_offset <= $end_read_1) {
3500 return;
3501 }
3502
3503 if ($methylation_calls[$index] eq 'X') {
3504 $counting{total_meCHG_count}++;
3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3506 if ($read_identity == 1){
3507 $mbias_1{CHG}->{$index+1}->{meth}++;
3508 }
3509 else{
3510 $mbias_2{CHG}->{$index+1}->{meth}++;
3511 }
3512 }
3513 elsif ($methylation_calls[$index] eq 'x') {
3514 $counting{total_unmethylated_CHG_count}++;
3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3516 if ($read_identity == 1){
3517 $mbias_1{CHG}->{$index+1}->{un}++;
3518 }
3519 else{
3520 $mbias_2{CHG}->{$index+1}->{un}++;
3521 }
3522 }
3523 elsif ($methylation_calls[$index] eq 'Z') {
3524 $counting{total_meCpG_count}++;
3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3526 if ($read_identity == 1){
3527 $mbias_1{CpG}->{$index+1}->{meth}++;
3528 }
3529 else{
3530 $mbias_2{CpG}->{$index+1}->{meth}++;
3531 }
3532 }
3533 elsif ($methylation_calls[$index] eq 'z') {
3534 $counting{total_unmethylated_CpG_count}++;
3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3536 if ($read_identity == 1){
3537 $mbias_1{CpG}->{$index+1}->{un}++;
3538 }
3539 else{
3540 $mbias_2{CpG}->{$index+1}->{un}++;
3541 }
3542 }
3543 elsif ($methylation_calls[$index] eq 'H') {
3544 $counting{total_meCHH_count}++;
3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3546 if ($read_identity == 1){
3547 $mbias_1{CHH}->{$index+1}->{meth}++;
3548 }
3549 else{
3550 $mbias_2{CHH}->{$index+1}->{meth}++;
3551 }
3552 }
3553 elsif ($methylation_calls[$index] eq 'h') {
3554 $counting{total_unmethylated_CHH_count}++;
3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3556 if ($read_identity == 1){
3557 $mbias_1{CHH}->{$index+1}->{un}++;
3558 }
3559 else{
3560 $mbias_2{CHH}->{$index+1}->{un}++;
3561 }
3562 }
3563 elsif ($methylation_calls[$index] eq '.') {}
3564 elsif (lc$methylation_calls[$index] eq 'u'){}
3565 else{
3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3567 }
3568 }
3569 } else {
3570 die "The strand orientation as neither + nor -: '$strand'\n";
3571 }
3572 }
3573 }
3574
3575 ### this is the default paired-end procedure allowing overlaps and using every single C position
3576 else {
3577 ### single-file CpG, CHG and CHH context output
3578 if ($full) {
3579 if ($strand eq '+') {
3580 for my $index (0..$#methylation_calls) {
3581
3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3585 $cigar_offset += $cigar_mod;
3586 $pos_offset += $pos_mod;
3587 }
3588
3589 if ($methylation_calls[$index] eq 'X') {
3590 $counting{total_meCHG_count}++;
3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3592 if ($read_identity == 1){
3593 $mbias_1{CHG}->{$index+1}->{meth}++;
3594 }
3595 else{
3596 $mbias_2{CHG}->{$index+1}->{meth}++;
3597 }
3598 }
3599 elsif ($methylation_calls[$index] eq 'x') {
3600 $counting{total_unmethylated_CHG_count}++;
3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3602 if ($read_identity == 1){
3603 $mbias_1{CHG}->{$index+1}->{un}++;
3604 }
3605 else{
3606 $mbias_2{CHG}->{$index+1}->{un}++;
3607 }
3608 }
3609 elsif ($methylation_calls[$index] eq 'Z') {
3610 $counting{total_meCpG_count}++;
3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3612 if ($read_identity == 1){
3613 $mbias_1{CpG}->{$index+1}->{meth}++;
3614 }
3615 else{
3616 $mbias_2{CpG}->{$index+1}->{meth}++;
3617 }
3618 }
3619 elsif ($methylation_calls[$index] eq 'z') {
3620 $counting{total_unmethylated_CpG_count}++;
3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3622 if ($read_identity == 1){
3623 $mbias_1{CpG}->{$index+1}->{un}++;
3624 }
3625 else{
3626 $mbias_2{CpG}->{$index+1}->{un}++;
3627 }
3628 }
3629 elsif ($methylation_calls[$index] eq 'H') {
3630 $counting{total_meCHH_count}++;
3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3632 if ($read_identity == 1){
3633 $mbias_1{CHH}->{$index+1}->{meth}++;
3634 }
3635 else{
3636 $mbias_2{CHH}->{$index+1}->{meth}++;
3637 }
3638 }
3639 elsif ($methylation_calls[$index] eq 'h') {
3640 $counting{total_unmethylated_CHH_count}++;
3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3642 if ($read_identity == 1){
3643 $mbias_1{CHH}->{$index+1}->{un}++;
3644 }
3645 else{
3646 $mbias_2{CHH}->{$index+1}->{un}++;
3647 }
3648 }
3649 elsif ($methylation_calls[$index] eq '.') {}
3650 elsif (lc$methylation_calls[$index] eq 'u'){}
3651 else{
3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3653 }
3654 }
3655 } elsif ($strand eq '-') {
3656 for my $index (0..$#methylation_calls) {
3657
3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3661 $cigar_offset += $cigar_mod;
3662 $pos_offset += $pos_mod;
3663 }
3664
3665 if ($methylation_calls[$index] eq 'X') {
3666 $counting{total_meCHG_count}++;
3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3668 if ($read_identity == 1){
3669 $mbias_1{CHG}->{$index+1}->{meth}++;
3670 }
3671 else{
3672 $mbias_2{CHG}->{$index+1}->{meth}++;
3673 }
3674 }
3675 elsif ($methylation_calls[$index] eq 'x') {
3676 $counting{total_unmethylated_CHG_count}++;
3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3678 if ($read_identity == 1){
3679 $mbias_1{CHG}->{$index+1}->{un}++;
3680 }
3681 else{
3682 $mbias_2{CHG}->{$index+1}->{un}++;
3683 }
3684 }
3685 elsif ($methylation_calls[$index] eq 'Z') {
3686 $counting{total_meCpG_count}++;
3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3688 if ($read_identity == 1){
3689 $mbias_1{CpG}->{$index+1}->{meth}++;
3690 }
3691 else{
3692 $mbias_2{CpG}->{$index+1}->{meth}++;
3693 }
3694 }
3695 elsif ($methylation_calls[$index] eq 'z') {
3696 $counting{total_unmethylated_CpG_count}++;
3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3698 if ($read_identity == 1){
3699 $mbias_1{CpG}->{$index+1}->{un}++;
3700 }
3701 else{
3702 $mbias_2{CpG}->{$index+1}->{un}++;
3703 }
3704 }
3705 elsif ($methylation_calls[$index] eq 'H') {
3706 $counting{total_meCHH_count}++;
3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3708 if ($read_identity == 1){
3709 $mbias_1{CHH}->{$index+1}->{meth}++;
3710 }
3711 else{
3712 $mbias_2{CHH}->{$index+1}->{meth}++;
3713 }
3714 }
3715 elsif ($methylation_calls[$index] eq 'h') {
3716 $counting{total_unmethylated_CHH_count}++;
3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3718 if ($read_identity == 1){
3719 $mbias_1{CHH}->{$index+1}->{un}++;
3720 }
3721 else{
3722 $mbias_2{CHH}->{$index+1}->{un}++;
3723 }
3724 }
3725 elsif ($methylation_calls[$index] eq '.') {}
3726 elsif (lc$methylation_calls[$index] eq 'u'){}
3727 else{
3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3729 }
3730 }
3731 } else {
3732 die "The strand orientation as neither + nor -: '$strand'\n";
3733 }
3734 }
3735
3736 ### strand-specific methylation output
3737 else {
3738 if ($strand eq '+') {
3739 for my $index (0..$#methylation_calls) {
3740
3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
3744 $cigar_offset += $cigar_mod;
3745 $pos_offset += $pos_mod;
3746 }
3747
3748 if ($methylation_calls[$index] eq 'X') {
3749 $counting{total_meCHG_count}++;
3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3751 if ($read_identity == 1){
3752 $mbias_1{CHG}->{$index+1}->{meth}++;
3753 }
3754 else{
3755 $mbias_2{CHG}->{$index+1}->{meth}++;
3756 }
3757 }
3758 elsif ($methylation_calls[$index] eq 'x') {
3759 $counting{total_unmethylated_CHG_count}++;
3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3761 if ($read_identity == 1){
3762 $mbias_1{CHG}->{$index+1}->{un}++;
3763 }
3764 else{
3765 $mbias_2{CHG}->{$index+1}->{un}++;
3766 }
3767 }
3768 elsif ($methylation_calls[$index] eq 'Z') {
3769 $counting{total_meCpG_count}++;
3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3771 if ($read_identity == 1){
3772 $mbias_1{CpG}->{$index+1}->{meth}++;
3773 }
3774 else{
3775 $mbias_2{CpG}->{$index+1}->{meth}++;
3776 }
3777 }
3778 elsif ($methylation_calls[$index] eq 'z') {
3779 $counting{total_unmethylated_CpG_count}++;
3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3781 if ($read_identity == 1){
3782 $mbias_1{CpG}->{$index+1}->{un}++;
3783 }
3784 else{
3785 $mbias_2{CpG}->{$index+1}->{un}++;
3786 }
3787 }
3788 elsif ($methylation_calls[$index] eq 'H') {
3789 $counting{total_meCHH_count}++;
3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3791 if ($read_identity == 1){
3792 $mbias_1{CHH}->{$index+1}->{meth}++;
3793 }
3794 else{
3795 $mbias_2{CHH}->{$index+1}->{meth}++;
3796 }
3797 }
3798 elsif ($methylation_calls[$index] eq 'h') {
3799 $counting{total_unmethylated_CHH_count}++;
3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3801 if ($read_identity == 1){
3802 $mbias_1{CHH}->{$index+1}->{un}++;
3803 }
3804 else{
3805 $mbias_2{CHH}->{$index+1}->{un}++;
3806 }
3807 }
3808 elsif ($methylation_calls[$index] eq '.') {}
3809 elsif (lc$methylation_calls[$index] eq 'u'){}
3810 else{
3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3812 }
3813 }
3814 } elsif ($strand eq '-') {
3815 for my $index (0..$#methylation_calls) {
3816
3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
3820 $cigar_offset += $cigar_mod;
3821 $pos_offset += $pos_mod;
3822 }
3823
3824 if ($methylation_calls[$index] eq 'X') {
3825 $counting{total_meCHG_count}++;
3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3827 if ($read_identity == 1){
3828 $mbias_1{CHG}->{$index+1}->{meth}++;
3829 }
3830 else{
3831 $mbias_2{CHG}->{$index+1}->{meth}++;
3832 }
3833 }
3834 elsif ($methylation_calls[$index] eq 'x') {
3835 $counting{total_unmethylated_CHG_count}++;
3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3837 if ($read_identity == 1){
3838 $mbias_1{CHG}->{$index+1}->{un}++;
3839 }
3840 else{
3841 $mbias_2{CHG}->{$index+1}->{un}++;
3842 }
3843 }
3844 elsif ($methylation_calls[$index] eq 'Z') {
3845 $counting{total_meCpG_count}++;
3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3847 if ($read_identity == 1){
3848 $mbias_1{CpG}->{$index+1}->{meth}++;
3849 }
3850 else{
3851 $mbias_2{CpG}->{$index+1}->{meth}++;
3852 }
3853 }
3854 elsif ($methylation_calls[$index] eq 'z') {
3855 $counting{total_unmethylated_CpG_count}++;
3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3857 if ($read_identity == 1){
3858 $mbias_1{CpG}->{$index+1}->{un}++;
3859 }
3860 else{
3861 $mbias_2{CpG}->{$index+1}->{un}++;
3862 }
3863 }
3864 elsif ($methylation_calls[$index] eq 'H') {
3865 $counting{total_meCHH_count}++;
3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3867 if ($read_identity == 1){
3868 $mbias_1{CHH}->{$index+1}->{meth}++;
3869 }
3870 else{
3871 $mbias_2{CHH}->{$index+1}->{meth}++;
3872 }
3873 }
3874 elsif ($methylation_calls[$index] eq 'h') {
3875 $counting{total_unmethylated_CHH_count}++;
3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
3877 if ($read_identity == 1){
3878 $mbias_1{CHH}->{$index+1}->{un}++;
3879 }
3880 else{
3881 $mbias_2{CHH}->{$index+1}->{un}++;
3882 }
3883 }
3884 elsif ($methylation_calls[$index] eq '.') {}
3885 elsif (lc$methylation_calls[$index] eq 'u'){}
3886 else{
3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
3888 }
3889 }
3890 } else {
3891 die "The strand orientation as neither + nor -: '$strand'\n";
3892 }
3893 }
3894 }
3895 }
3896
3897 sub check_cigar_string {
3898 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;
3899 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t";
3900 my ($new_cigar_offset,$new_pos_offset) = (0,0);
3901
3902 if ($strand eq '+') {
3903 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
3904
3905 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
3906 # warn "position needs no adjustment\n";
3907 }
3908
3909 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
3910 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
3911 # warn "adjusted genomic position by -1 bp (insertion)\n";
3912 }
3913
3914 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
3915 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
3916 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
3917 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
3918
3919 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
3920 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
3921 # warn "position needs no adjustment\n";
3922 last;
3923 }
3924 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
3925 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position
3926 # warn "adjusted genomic position by another -1 bp (insertion)\n";
3927 last;
3928 }
3929 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
3930 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
3931 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position
3932 # warn "adjusted genomic position by another +1 bp (deletion)\n";
3933 }
3934 else{
3935 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
3936 }
3937 }
3938 }
3939 else{
3940 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
3941 }
3942 }
3943
3944 elsif ($strand eq '-') {
3945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t";
3946
3947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
3948 # warn "position needs no adjustment\n";
3949 }
3950
3951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence
3952 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position
3953 # warn "adjusted genomic position by +1 bp (insertion)\n";
3954 }
3955
3956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
3957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
3958 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
3959 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n";
3960
3961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){
3962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position
3963 # warn "Found new 'M' operation; position needs no adjustment\n";
3964 last;
3965 }
3966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){
3967 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position
3968 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n";
3969 last;
3970 }
3971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence
3972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index
3973 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position
3974 # warn "adjusted genomic position by another -1 bp (deletion)\n";
3975 }
3976 else{
3977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
3978 }
3979 }
3980 }
3981 else{
3982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n";
3983 }
3984 }
3985 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n";
3986 return ($new_cigar_offset,$new_pos_offset);
3987 }
3988
3989 sub print_individual_C_methylation_states_single_end{
3990
3991 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
3992 my @methylation_calls = split(//,$meth_call);
3993
3994 #################################################################
3995 ### . for bases not involving cytosines ###
3996 ### X for methylated C in CHG context (was protected) ###
3997 ### x for not methylated C in CHG context (was converted) ###
3998 ### H for methylated C in CHH context (was protected) ###
3999 ### h for not methylated C in CHH context (was converted) ###
4000 ### Z for methylated C in CpG context (was protected) ###
4001 ### z for not methylated C in CpG context (was converted) ###
4002 #################################################################
4003
4004 my $methyl_CHG_count = 0;
4005 my $methyl_CHH_count = 0;
4006 my $methyl_CpG_count = 0;
4007 my $unmethylated_CHG_count = 0;
4008 my $unmethylated_CHH_count = 0;
4009 my $unmethylated_CpG_count = 0;
4010
4011 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
4012 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
4013
4014 my @comp_cigar;
4015
4016 if ($cigar){ # parsing CIGAR string
4017
4018 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
4019 if ($cigar =~ /^\d+M$/){
4020 # warn "See!? I told you so! $cigar\n";
4021 # sleep(1);
4022 }
4023 else{
4024
4025 my @len;
4026 my @ops;
4027
4028 @len = split (/\D+/,$cigar); # storing the length per operation
4029 @ops = split (/\d+/,$cigar); # storing the operation
4030 shift @ops; # remove the empty first element
4031 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops);
4032 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
4033
4034 foreach my $index (0..$#len){
4035 foreach (1..$len[$index]){
4036 # print "$ops[$index]";
4037 push @comp_cigar, $ops[$index];
4038 }
4039 }
4040 }
4041 # warn "\nDetected CIGAR string: $cigar\n";
4042 # warn "Length of methylation call: ",length $meth_call,"\n";
4043 # warn "number of operations: ",scalar @ops,"\n";
4044 # warn "number of length digits: ",scalar @len,"\n\n";
4045 # print @comp_cigar,"\n";
4046 # print "$meth_call\n\n";
4047 # sleep (1);
4048 }
4049
4050 ### adjusting the start position for all reads mapping to the reverse strand
4051 if ($strand eq '-') {
4052
4053 if (@comp_cigar){ # only needed for SAM reads with InDels
4054 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
4055 # print @comp_cigar,"\n";
4056 }
4057
4058 unless ($ignore){ ### if --ignore was specified the start position has already been corrected
4059
4060 if ($cigar){ ### SAM format
4061 if ($cigar =~ /^(\d+)M$/){ # linear match
4062 $start += $1 - 1;
4063 }
4064 else{ # InDel read
4065 my $MD_count = 0;
4066 foreach (@comp_cigar){
4067 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
4068 }
4069 $start += $MD_count - 1;
4070 }
4071 }
4072 else{ ### vanilla format
4073 $start += length($meth_call)-1;
4074 }
4075 }
4076 }
4077
4078 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL)
4079
4080 ### single-file CpG and other-context output
4081 if ($full and $merge_non_CpG) {
4082 if ($strand eq '+') {
4083 for my $index (0..$#methylation_calls) {
4084
4085 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels
4086 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4087 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
4088 $cigar_offset += $cigar_mod;
4089 $pos_offset += $pos_mod;
4090 }
4091
4092 ### methylated Cs (any context) will receive a forward (+) orientation
4093 ### not methylated Cs (any context) will receive a reverse (-) orientation
4094 if ($methylation_calls[$index] eq 'X') {
4095 $counting{total_meCHG_count}++;
4096 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4097 $mbias_1{CHG}->{$index+1}->{meth}++;
4098 }
4099 elsif ($methylation_calls[$index] eq 'x') {
4100 $counting{total_unmethylated_CHG_count}++;
4101 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4102 $mbias_1{CHG}->{$index+1}->{un}++;
4103 }
4104 elsif ($methylation_calls[$index] eq 'Z') {
4105 $counting{total_meCpG_count}++;
4106 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4107 $mbias_1{CpG}->{$index+1}->{meth}++;
4108 }
4109 elsif ($methylation_calls[$index] eq 'z') {
4110 $counting{total_unmethylated_CpG_count}++;
4111 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4112 $mbias_1{CpG}->{$index+1}->{un}++;
4113 }
4114 elsif ($methylation_calls[$index] eq 'H') {
4115 $counting{total_meCHH_count}++;
4116 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4117 $mbias_1{CHH}->{$index+1}->{meth}++;
4118 }
4119 elsif ($methylation_calls[$index] eq 'h') {
4120 $counting{total_unmethylated_CHH_count}++;
4121 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4122 $mbias_1{CHH}->{$index+1}->{un}++;
4123 }
4124 elsif ($methylation_calls[$index] eq '.') {}
4125 elsif (lc$methylation_calls[$index] eq 'u'){}
4126 else{
4127 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4128 }
4129 }
4130 }
4131 elsif ($strand eq '-') {
4132
4133 for my $index (0..$#methylation_calls) {
4134 ### methylated Cs (any context) will receive a forward (+) orientation
4135 ### not methylated Cs (any context) will receive a reverse (-) orientation
4136
4137 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels
4138 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
4139 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4140 $cigar_offset += $cigar_mod;
4141 $pos_offset += $pos_mod;
4142 }
4143
4144 if ($methylation_calls[$index] eq 'X') {
4145 $counting{total_meCHG_count}++;
4146 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4147 $mbias_1{CHG}->{$index+1}->{meth}++;
4148 }
4149 elsif ($methylation_calls[$index] eq 'x') {
4150 $counting{total_unmethylated_CHG_count}++;
4151 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4152 $mbias_1{CHG}->{$index+1}->{un}++;
4153 }
4154 elsif ($methylation_calls[$index] eq 'Z') {
4155 $counting{total_meCpG_count}++;
4156 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4157 $mbias_1{CpG}->{$index+1}->{meth}++;
4158 }
4159 elsif ($methylation_calls[$index] eq 'z') {
4160 $counting{total_unmethylated_CpG_count}++;
4161 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4162 $mbias_1{CpG}->{$index+1}->{un}++;
4163 }
4164 elsif ($methylation_calls[$index] eq 'H') {
4165 $counting{total_meCHH_count}++;
4166 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4167 $mbias_1{CHH}->{$index+1}->{meth}++;
4168 }
4169 elsif ($methylation_calls[$index] eq 'h') {
4170 $counting{total_unmethylated_CHH_count}++;
4171 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4172 $mbias_1{CHH}->{$index+1}->{un}++;
4173 }
4174 elsif ($methylation_calls[$index] eq '.'){}
4175 elsif (lc$methylation_calls[$index] eq 'u'){}
4176 else{
4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4178 }
4179 }
4180 }
4181 else {
4182 die "The strand information was neither + nor -: $strand\n";
4183 }
4184 }
4185
4186 ### strand-specific methylation output
4187 elsif ($merge_non_CpG) {
4188 if ($strand eq '+') {
4189 for my $index (0..$#methylation_calls) {
4190 ### methylated Cs (any context) will receive a forward (+) orientation
4191 ### not methylated Cs (any context) will receive a reverse (-) orientation
4192
4193 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4194 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4195 $cigar_offset += $cigar_mod;
4196 $pos_offset += $pos_mod;
4197 }
4198
4199 if ($methylation_calls[$index] eq 'X') {
4200 $counting{total_meCHG_count}++;
4201 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4202 $mbias_1{CHG}->{$index+1}->{meth}++;
4203 }
4204 elsif ($methylation_calls[$index] eq 'x') {
4205 $counting{total_unmethylated_CHG_count}++;
4206 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4207 $mbias_1{CHG}->{$index+1}->{un}++;
4208 }
4209 elsif ($methylation_calls[$index] eq 'Z') {
4210 $counting{total_meCpG_count}++;
4211 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4212 $mbias_1{CpG}->{$index+1}->{meth}++;
4213 }
4214 elsif ($methylation_calls[$index] eq 'z') {
4215 $counting{total_unmethylated_CpG_count}++;
4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4217 $mbias_1{CpG}->{$index+1}->{un}++;
4218 }
4219 elsif ($methylation_calls[$index] eq 'H') {
4220 $counting{total_meCHH_count}++;
4221 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4222 $mbias_1{CHH}->{$index+1}->{meth}++;
4223 }
4224 elsif ($methylation_calls[$index] eq 'h') {
4225 $counting{total_unmethylated_CHH_count}++;
4226 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4227 $mbias_1{CHH}->{$index+1}->{un}++;
4228 }
4229 elsif ($methylation_calls[$index] eq '.') {}
4230 elsif (lc$methylation_calls[$index] eq 'u'){}
4231 else{
4232 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4233 }
4234 }
4235 }
4236 elsif ($strand eq '-') {
4237
4238 for my $index (0..$#methylation_calls) {
4239 ### methylated Cs (any context) will receive a forward (+) orientation
4240 ### not methylated Cs (any context) will receive a reverse (-) orientation
4241
4242 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4243 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4244 $cigar_offset += $cigar_mod;
4245 $pos_offset += $pos_mod;
4246 }
4247
4248 if ($methylation_calls[$index] eq 'X') {
4249 $counting{total_meCHG_count}++;
4250 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4251 $mbias_1{CHG}->{$index+1}->{meth}++;
4252 }
4253 elsif ($methylation_calls[$index] eq 'x') {
4254 $counting{total_unmethylated_CHG_count}++;
4255 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4256 $mbias_1{CHG}->{$index+1}->{un}++;
4257 }
4258 elsif ($methylation_calls[$index] eq 'Z') {
4259 $counting{total_meCpG_count}++;
4260 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4261 $mbias_1{CpG}->{$index+1}->{meth}++;
4262 }
4263 elsif ($methylation_calls[$index] eq 'z') {
4264 $counting{total_unmethylated_CpG_count}++;
4265 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4266 $mbias_1{CpG}->{$index+1}->{un}++;
4267 }
4268 elsif ($methylation_calls[$index] eq 'H') {
4269 $counting{total_meCHH_count}++;
4270 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4271 $mbias_1{CHH}->{$index+1}->{meth}++;
4272 }
4273 elsif ($methylation_calls[$index] eq 'h') {
4274 $counting{total_unmethylated_CHH_count}++;
4275 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4276 $mbias_1{CHH}->{$index+1}->{un}++;
4277 }
4278 elsif ($methylation_calls[$index] eq '.') {}
4279 elsif (lc$methylation_calls[$index] eq 'u'){}
4280 else{
4281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4282 }
4283 }
4284 }
4285 else {
4286 die "The strand information was neither + nor -: $strand\n";
4287 }
4288 }
4289
4290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION
4291
4292 elsif ($full) {
4293 if ($strand eq '+') {
4294 for my $index (0..$#methylation_calls) {
4295 ### methylated Cs (any context) will receive a forward (+) orientation
4296 ### not methylated Cs (any context) will receive a reverse (-) orientation
4297
4298 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4300 $cigar_offset += $cigar_mod;
4301 $pos_offset += $pos_mod;
4302 }
4303
4304 if ($methylation_calls[$index] eq 'X') {
4305 $counting{total_meCHG_count}++;
4306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4307 $mbias_1{CHG}->{$index+1}->{meth}++;
4308 }
4309 elsif ($methylation_calls[$index] eq 'x') {
4310 $counting{total_unmethylated_CHG_count}++;
4311 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4312 $mbias_1{CHG}->{$index+1}->{un}++;
4313 }
4314 elsif ($methylation_calls[$index] eq 'Z') {
4315 $counting{total_meCpG_count}++;
4316 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4317 $mbias_1{CpG}->{$index+1}->{meth}++;
4318 }
4319 elsif ($methylation_calls[$index] eq 'z') {
4320 $counting{total_unmethylated_CpG_count}++;
4321 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4322 $mbias_1{CpG}->{$index+1}->{un}++;
4323 }
4324 elsif ($methylation_calls[$index] eq 'H') {
4325 $counting{total_meCHH_count}++;
4326 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4327 $mbias_1{CHH}->{$index+1}->{meth}++;
4328 }
4329 elsif ($methylation_calls[$index] eq 'h') {
4330 $counting{total_unmethylated_CHH_count}++;
4331 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4332 $mbias_1{CHH}->{$index+1}->{un}++;
4333 }
4334 elsif ($methylation_calls[$index] eq '.') {}
4335 elsif (lc$methylation_calls[$index] eq 'u'){}
4336 else{
4337 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
4338 }
4339 }
4340 }
4341 elsif ($strand eq '-') {
4342
4343 for my $index (0..$#methylation_calls) {
4344 ### methylated Cs (any context) will receive a forward (+) orientation
4345 ### not methylated Cs (any context) will receive a reverse (-) orientation
4346
4347 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4348 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4349 $cigar_offset += $cigar_mod;
4350 $pos_offset += $pos_mod;
4351 }
4352
4353 if ($methylation_calls[$index] eq 'X') {
4354 $counting{total_meCHG_count}++;
4355 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4356 $mbias_1{CHG}->{$index+1}->{meth}++;
4357 }
4358 elsif ($methylation_calls[$index] eq 'x') {
4359 $counting{total_unmethylated_CHG_count}++;
4360 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4361 $mbias_1{CHG}->{$index+1}->{un}++;
4362 }
4363 elsif ($methylation_calls[$index] eq 'Z') {
4364 $counting{total_meCpG_count}++;
4365 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4366 $mbias_1{CpG}->{$index+1}->{meth}++;
4367 }
4368 elsif ($methylation_calls[$index] eq 'z') {
4369 $counting{total_unmethylated_CpG_count}++;
4370 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4371 $mbias_1{CpG}->{$index+1}->{un}++;
4372 }
4373 elsif ($methylation_calls[$index] eq 'H') {
4374 $counting{total_meCHH_count}++;
4375 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4376 $mbias_1{CHH}->{$index+1}->{meth}++;
4377 }
4378 elsif ($methylation_calls[$index] eq 'h') {
4379 $counting{total_unmethylated_CHH_count}++;
4380 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4381 $mbias_1{CHH}->{$index+1}->{un}++;
4382 }
4383 elsif ($methylation_calls[$index] eq '.') {}
4384 elsif (lc$methylation_calls[$index] eq 'u'){}
4385 else{
4386 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4387 }
4388 }
4389 }
4390 else {
4391 die "The read had a strand orientation which was neither + nor -: $strand\n";
4392 }
4393 }
4394
4395 ### strand-specific methylation output
4396 else {
4397 if ($strand eq '+') {
4398 for my $index (0..$#methylation_calls) {
4399 ### methylated Cs (any context) will receive a forward (+) orientation
4400 ### not methylated Cs (any context) will receive a reverse (-) orientation
4401
4402 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4403 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4404 $cigar_offset += $cigar_mod;
4405 $pos_offset += $pos_mod;
4406 }
4407
4408 if ($methylation_calls[$index] eq 'X') {
4409 $counting{total_meCHG_count}++;
4410 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4411 $mbias_1{CHG}->{$index+1}->{meth}++;
4412 }
4413 elsif ($methylation_calls[$index] eq 'x') {
4414 $counting{total_unmethylated_CHG_count}++;
4415 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4416 $mbias_1{CHG}->{$index+1}->{un}++;
4417 }
4418 elsif ($methylation_calls[$index] eq 'Z') {
4419 $counting{total_meCpG_count}++;
4420 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4421 $mbias_1{CpG}->{$index+1}->{meth}++;
4422 }
4423 elsif ($methylation_calls[$index] eq 'z') {
4424 $counting{total_unmethylated_CpG_count}++;
4425 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4426 $mbias_1{CpG}->{$index+1}->{un}++;
4427 }
4428 elsif ($methylation_calls[$index] eq 'H') {
4429 $counting{total_meCHH_count}++;
4430 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4431 $mbias_1{CHH}->{$index+1}->{meth}++;
4432 }
4433 elsif ($methylation_calls[$index] eq 'h') {
4434 $counting{total_unmethylated_CHH_count}++;
4435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4436 $mbias_1{CHH}->{$index+1}->{un}++;
4437 }
4438 elsif ($methylation_calls[$index] eq '.') {}
4439 elsif (lc$methylation_calls[$index] eq 'u'){}
4440 else{
4441 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4442 }
4443 }
4444 }
4445 elsif ($strand eq '-') {
4446
4447 for my $index (0..$#methylation_calls) {
4448 ### methylated Cs (any context) will receive a forward (+) orientation
4449 ### not methylated Cs (any context) will receive a reverse (-) orientation
4450
4451 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels
4452 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
4453 $cigar_offset += $cigar_mod;
4454 $pos_offset += $pos_mod;
4455 }
4456
4457 if ($methylation_calls[$index] eq 'X') {
4458 $counting{total_meCHG_count}++;
4459 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4460 $mbias_1{CHG}->{$index+1}->{meth}++;
4461 }
4462 elsif ($methylation_calls[$index] eq 'x') {
4463 $counting{total_unmethylated_CHG_count}++;
4464 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4465 $mbias_1{CHG}->{$index+1}->{un}++;
4466 }
4467 elsif ($methylation_calls[$index] eq 'Z') {
4468 $counting{total_meCpG_count}++;
4469 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4470 $mbias_1{CpG}->{$index+1}->{meth}++;
4471 }
4472 elsif ($methylation_calls[$index] eq 'z') {
4473 $counting{total_unmethylated_CpG_count}++;
4474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4475 $mbias_1{CpG}->{$index+1}->{un}++;
4476 }
4477 elsif ($methylation_calls[$index] eq 'H') {
4478 $counting{total_meCHH_count}++;
4479 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4480 $mbias_1{CHH}->{$index+1}->{meth}++;
4481 }
4482 elsif ($methylation_calls[$index] eq 'h') {
4483 $counting{total_unmethylated_CHH_count}++;
4484 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
4485 $mbias_1{CHH}->{$index+1}->{un}++;
4486 }
4487 elsif ($methylation_calls[$index] eq '.') {}
4488 elsif (lc$methylation_calls[$index] eq 'u'){}
4489 else{
4490 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
4491 }
4492 }
4493 }
4494 else {
4495 die "The strand information was neither + nor -: $strand\n";
4496 }
4497 }
4498 }
4499
4500
4501
4502 sub print_helpfile{
4503
4504 print << 'HOW_TO';
4505
4506
4507 DESCRIPTION
4508
4509 The following is a brief description of all options to control the Bismark
4510 methylation extractor. The script reads in a bisulfite read alignment results file
4511 produced by the Bismark bisulfite mapper and extracts the methylation information
4512 for individual cytosines. This information is found in the methylation call field
4513 which can contain the following characters:
4514
4515 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4516 ~~~ X for methylated C in CHG context ~~~
4517 ~~~ x for not methylated C CHG ~~~
4518 ~~~ H for methylated C in CHH context ~~~
4519 ~~~ h for not methylated C in CHH context ~~~
4520 ~~~ Z for methylated C in CpG context ~~~
4521 ~~~ z for not methylated C in CpG context ~~~
4522 ~~~ U for methylated C in Unknown context (CN or CHN ~~~
4523 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~
4524 ~~~ . for any bases not involving cytosines ~~~
4525 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4526
4527 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
4528 context (this distinction is actually already made in Bismark itself). As the methylation
4529 information for every C analysed can produce files which easily have tens or even hundreds of
4530 millions of lines, file sizes can become very large and more difficult to handle. The C
4531 methylation info additionally splits cytosine methylation calls up into one of the four possible
4532 strands a given bisulfite read aligned against:
4533
4534 OT original top strand
4535 CTOT complementary to original top strand
4536
4537 OB original bottom strand
4538 CTOB complementary to original bottom strand
4539
4540 Thus, by default twelve individual output files are being generated per input file (unless
4541 --comprehensive is specified, see below). The output files can be imported into a genome
4542 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
4543 unless the bisulfite reads were generated preserving directionality it doesn't make any
4544 sense to look at the data in a strand-specific manner). Strand-specific output files can
4545 optionally be skipped, in which case only three output files for CpG, CHG or CHH context
4546 will be generated. For both the strand-specific and comprehensive outputs there is also
4547 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.
4548
4549
4550 The output files are in the following format (tab delimited):
4551
4552 <sequence_id> <strand> <chromosome> <position> <methylation call>
4553
4554
4555 USAGE: methylation_extractor [options] <filenames>
4556
4557
4558 ARGUMENTS:
4559 ==========
4560
4561 <filenames> A space-separated list of Bismark result files in SAM format from
4562 which methylation information is extracted for every cytosine in
4563 the reads. For alignment files in the older custom Bismark output
4564 see option '--vanilla'.
4565
4566 OPTIONS:
4567
4568 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end
4569 read data. Specifying either --single-end or --paired-end is
4570 mandatory.
4571
4572 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end
4573 read data. Specifying either --paired-end or --single-end is
4574 mandatory.
4575
4576 --vanilla The Bismark result input file(s) are in the old custom Bismark format
4577 (up to version 0.5.x) and not in SAM format which is the default as
4578 of Bismark version 0.6.x or higher. Default: OFF.
4579
4580 --no_overlap For paired-end reads it is theoretically possible that read_1 and
4581 read_2 overlap. This option avoids scoring overlapping methylation
4582 calls twice (only methylation calls of read 1 are used for in the process
4583 since read 1 has historically higher quality basecalls than read 2).
4584 Whilst this option removes a bias towards more methylation calls
4585 in the center of sequenced fragments it may de facto remove a sizable
4586 proportion of the data. This option is highly recommended for paired-end
4587 data.
4588
4589 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 when processing the
4590 methylation call string. This can remove e.g. a restriction enzyme site
4591 at the start of each read or any other source of bias (e.g. PBAT-Seq data).
4592
4593 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
4594 results only. Since the first couple of bases in Read 2 of BS-Seq experiments
4595 show a severe bias towards non-methylation as a result of end-repairing
4596 sonicated fragments with unmethylated cytosines (see M-bias plot), it is
4597 recommended that the first couple of bp of Read 2 are removed before
4598 starting downstream analysis. Please see the section on M-bias plots in the
4599 Bismark User Guide for more details.
4600
4601 --comprehensive Specifying this option will merge all four possible strand-specific
4602 methylation info into context-dependent output files. The default
4603
4604 contexts are:
4605 - CpG context
4606 - CHG context
4607 - CHH context
4608
4609 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight
4610 strand-specific output files (default) for Cs in
4611 - CpG context
4612 - non-CpG context
4613
4614 --report Prints out a short methylation summary as well as the paramaters used to run
4615 this script.
4616
4617 --no_header Suppresses the Bismark version header line in all output files for more convenient
4618 batch processing.
4619
4620 -o/--output DIR Allows specification of a different output directory (absolute or relative
4621 path). If not specified explicitely, the output will be written to the current directory.
4622
4623 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
4624 explicitly if Samtools is in the PATH already.
4625
4626 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
4627 a GZIP compressed form to save disk space. This option does not work on bedGraph and
4628 genome-wide cytosine reports as they are 'tiny' anyway.
4629
4630 --version Displays version information.
4631
4632 -h/--help Displays this help file and exits.
4633
4634 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as
4635 well as a report (optional) and then quit. Default: OFF.
4636
4637
4638
4639 bedGraph specific options:
4640 ==========================
4641
4642 --bedGraph After finishing the methylation extraction, the methylation output is written into a
4643 sorted bedGraph file that reports the position of a given cytosine and its methylation
4644 state (in %, see details below). The methylation extractor output is temporarily split up into
4645 temporary files, one per chromosome (written into the current directory or folder
4646 specified with -o/--output); these temp files are then used for sorting and deleted
4647 afterwards. By default, only cytosines in CpG context will be sorted. The option
4648 '--CX_context' may be used to report all cytosines irrespective of sequence context
4649 (this will take MUCH longer!). The default folder for temporary files during the sorting
4650 process is the output directory. The bedGraph conversion step is performed by the external
4651 module 'bismark2bedGraph'; this script needs to reside in the same folder as the
4652 bismark_methylation_extractor itself.
4653
4654
4655 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide
4656 before its methylation percentage is reported. Default: 1.
4657
4658 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting.
4659
4660
4661 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered
4662 in the experiment irrespective of its sequence context. This applies to both forward and
4663 reverse strands. Please be aware that this option may generate large temporary and output files
4664 and may take a long time to sort (up to many hours). Default: OFF.
4665 (i.e. Default = CpG context only).
4666
4667 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information.
4668 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
4669 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
4670 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
4671 Defaults to 2G.
4672
4673 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of
4674 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
4675 individual chromosome files. These errors were caused by the operating system's limit
4676 of the number of filehandle that can be written to at any one time (typically 1024; to
4677 find out this limit on Linux, type: ulimit -a).
4678 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
4679 methylation calls into individual chromosome files. Instead, all input files are
4680 temporarily merged into a single file (unless there is only a single file), and this
4681 file will then be sorted by both chromosome AND position using the Unix sort command.
4682 Please be aware that this option might take a looooong time to complete, depending on
4683 the size of the input files, and the memory you allocate to this process (see --buffer_size).
4684 Nevertheless, it seems to be working.
4685
4686 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
4687 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
4688 sorting process of very large files, but this comes at the cost of a larger memory footprint
4689 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
4690 of RAM). Due to overheads in creating and looping through these arrays it seems that it will
4691 actually be *slower* for small files (few million alignments), and we are currently testing at
4692 which point it is advisable to use this option. Note that --ample_memory is not compatible
4693 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).
4694
4695
4696
4697 Genome-wide cytosine methylation report specific options:
4698 =========================================================
4699
4700 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
4701 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
4702 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all
4703 cytosine context is optional). The output considers all Cs on both forward and reverse strands and
4704 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not
4705 covered). The cytsoine report conversion step is performed by the external module
4706 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
4707 itself.
4708
4709 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
4710 its context. This applies to both forward and reverse strands. Please be aware that this will
4711 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
4712 Default: OFF (i.e. Default = CpG context only).
4713
4714 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.
4715
4716 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
4717 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
4718
4719 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
4720 will be named to include the input filename and the chromosome number.
4721
4722
4723
4724 OUTPUT:
4725
4726 The bismark_methylation_extractor output is in the form:
4727 ========================================================
4728 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call>
4729
4730 * Methylated cytosines receive a '+' orientation,
4731 * Unmethylated cytosines receive a '-' orientation.
4732
4733
4734
4735 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
4736 =========================================================================================================
4737
4738 track type=bedGraph (header line)
4739
4740 <chromosome> <start position> <end position> <methylation percentage>
4741
4742
4743
4744 The coverage output looks like this (tab-delimited, 1-based genomic coords):
4745 ============================================================================
4746
4747 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated>
4748
4749
4750
4751 The genome-wide cytosine methylation output file is tab-delimited in the following format:
4752 ==========================================================================================
4753 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
4754
4755
4756
4757 This script was last modified on 25 November 2013.
4758
4759 HOW_TO
4760 }