Mercurial > repos > bgruening > bismark
comparison old/bismark_methylation_extractor @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
| author | bgruening |
|---|---|
| date | Sat, 06 May 2017 13:18:09 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 6:0f8646f22b8d | 7:fcadce4d9a06 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 use warnings; | |
| 3 use strict; | |
| 4 $|++; | |
| 5 use Getopt::Long; | |
| 6 use Cwd; | |
| 7 use Carp; | |
| 8 use FindBin qw($Bin); | |
| 9 use lib "$Bin/../lib"; | |
| 10 | |
| 11 | |
| 12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) | |
| 13 | |
| 14 ## This program is free software: you can redistribute it and/or modify | |
| 15 ## it under the terms of the GNU General Public License as published by | |
| 16 ## the Free Software Foundation, either version 3 of the License, or | |
| 17 ## (at your option) any later version. | |
| 18 | |
| 19 ## This program is distributed in the hope that it will be useful, | |
| 20 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 21 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 22 ## GNU General Public License for more details. | |
| 23 | |
| 24 ## You should have received a copy of the GNU General Public License | |
| 25 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 26 | |
| 27 my @filenames; # input files | |
| 28 my %counting; | |
| 29 my $parent_dir = getcwd(); | |
| 30 | |
| 31 my %fhs; | |
| 32 | |
| 33 my $version = 'v0.10.1'; | |
| 34 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline(); | |
| 35 | |
| 36 | |
| 37 ### only needed for bedGraph output | |
| 38 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files | |
| 39 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | |
| 40 my @bedfiles; | |
| 41 | |
| 42 ### only needed for genome-wide cytosine methylation report | |
| 43 my %chromosomes; | |
| 44 | |
| 45 my %mbias_1; | |
| 46 my %mbias_2; | |
| 47 | |
| 48 ############################################################################################## | |
| 49 ### Summarising Run Parameters | |
| 50 ############################################################################################## | |
| 51 | |
| 52 ### METHYLATION EXTRACTOR | |
| 53 | |
| 54 warn "Summarising Bismark methylation extractor parameters:\n"; | |
| 55 warn '='x63,"\n"; | |
| 56 | |
| 57 if ($single){ | |
| 58 if ($vanilla){ | |
| 59 warn "Bismark single-end vanilla format specified\n"; | |
| 60 } | |
| 61 else{ | |
| 62 warn "Bismark single-end SAM format specified (default)\n"; # default | |
| 63 } | |
| 64 } | |
| 65 elsif ($paired){ | |
| 66 if ($vanilla){ | |
| 67 warn "Bismark paired-end vanilla format specified\n"; | |
| 68 } | |
| 69 else{ | |
| 70 warn "Bismark paired-end SAM format specified (default)\n"; # default | |
| 71 } | |
| 72 } | |
| 73 | |
| 74 if ($single){ | |
| 75 if ($ignore){ | |
| 76 warn "First $ignore bp will be disregarded when processing the methylation call string\n"; | |
| 77 } | |
| 78 } | |
| 79 else{ ## paired-end | |
| 80 if ($ignore){ | |
| 81 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; | |
| 82 } | |
| 83 if ($ignore_r2){ | |
| 84 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 | |
| 89 if ($full){ | |
| 90 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | |
| 91 } | |
| 92 if ($merge_non_CpG){ | |
| 93 warn "Merge CHG and CHH context to non-CpG context specified\n"; | |
| 94 } | |
| 95 ### output directory | |
| 96 if ($output_dir eq ''){ | |
| 97 warn "Output will be written to the current directory ('$parent_dir')\n"; | |
| 98 } | |
| 99 else{ | |
| 100 warn "Output path specified as: $output_dir\n"; | |
| 101 } | |
| 102 | |
| 103 | |
| 104 sleep (1); | |
| 105 | |
| 106 ### BEDGRAPH | |
| 107 | |
| 108 if ($bedGraph){ | |
| 109 warn "\n\nSummarising bedGraph parameters:\n"; | |
| 110 warn '='x63,"\n"; | |
| 111 | |
| 112 if ($counts){ | |
| 113 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n"; | |
| 114 } | |
| 115 else{ | |
| 116 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; | |
| 117 } | |
| 118 | |
| 119 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; | |
| 120 | |
| 121 if ($CX_context){ | |
| 122 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; | |
| 123 } | |
| 124 else{ # default | |
| 125 $CpG_only = 1; | |
| 126 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; | |
| 127 } | |
| 128 | |
| 129 if ($remove){ | |
| 130 warn "White spaces in read ID names will be removed prior to sorting\n"; | |
| 131 } | |
| 132 | |
| 133 if ($ample_mem){ | |
| 134 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; | |
| 135 } | |
| 136 elsif (defined $sort_size){ | |
| 137 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; | |
| 138 } | |
| 139 else{ | |
| 140 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; | |
| 141 } | |
| 142 | |
| 143 | |
| 144 | |
| 145 sleep (1); | |
| 146 | |
| 147 if ($cytosine_report){ | |
| 148 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; | |
| 149 warn '='x63,"\n"; | |
| 150 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n"; | |
| 151 | |
| 152 | |
| 153 if ($CX_context){ | |
| 154 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; | |
| 155 } | |
| 156 else{ # default | |
| 157 $CpG_only = 1; | |
| 158 warn "Reporting cytosine methylation in CpG context only (default)\n"; | |
| 159 } | |
| 160 | |
| 161 if ($split_by_chromosome){ | |
| 162 warn "Splitting the cytosine report output up into individual files for each chromosome\n"; | |
| 163 } | |
| 164 | |
| 165 ### Zero-based coordinates | |
| 166 if ($zero){ | |
| 167 warn "Using zero-based genomic coordinates (user-defined)\n"; | |
| 168 } | |
| 169 else{ # default, 1-based coords | |
| 170 warn "Using 1-based genomic coordinates (default)\n"; | |
| 171 } | |
| 172 | |
| 173 ### GENOME folder | |
| 174 if ($genome_folder){ | |
| 175 unless ($genome_folder =~/\/$/){ | |
| 176 $genome_folder =~ s/$/\//; | |
| 177 } | |
| 178 warn "Genome folder was specified as $genome_folder\n"; | |
| 179 } | |
| 180 else{ | |
| 181 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; | |
| 182 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; | |
| 183 } | |
| 184 sleep (1); | |
| 185 } | |
| 186 } | |
| 187 | |
| 188 warn "\n"; | |
| 189 sleep (5); | |
| 190 | |
| 191 ###################################################### | |
| 192 ### PROCESSING FILES | |
| 193 ###################################################### | |
| 194 | |
| 195 foreach my $filename (@filenames){ | |
| 196 # resetting counters and filehandles | |
| 197 %fhs = (); | |
| 198 %counting =( | |
| 199 total_meCHG_count => 0, | |
| 200 total_meCHH_count => 0, | |
| 201 total_meCpG_count => 0, | |
| 202 total_unmethylated_CHG_count => 0, | |
| 203 total_unmethylated_CHH_count => 0, | |
| 204 total_unmethylated_CpG_count => 0, | |
| 205 sequences_count => 0, | |
| 206 ); | |
| 207 | |
| 208 @sorting_files = (); | |
| 209 @bedfiles = (); | |
| 210 | |
| 211 %mbias_1 = (); | |
| 212 %mbias_2 = (); | |
| 213 | |
| 214 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor | |
| 215 unless ($vanilla){ | |
| 216 if ($paired){ | |
| 217 test_positional_sorting($filename); | |
| 218 } | |
| 219 } | |
| 220 | |
| 221 process_Bismark_results_file($filename); | |
| 222 | |
| 223 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | |
| 224 foreach my $fh (keys %fhs) { | |
| 225 if ($fh =~ /^[1230]$/) { | |
| 226 foreach my $context (keys %{$fhs{$fh}}) { | |
| 227 close $fhs{$fh}->{$context} or die $!; | |
| 228 } | |
| 229 } | |
| 230 else{ | |
| 231 close $fhs{$fh} or die $!; | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 ### printing out all M-Bias data | |
| 236 produce_mbias_plots ($filename); | |
| 237 | |
| 238 delete_unused_files(); | |
| 239 | |
| 240 if ($bedGraph){ | |
| 241 | |
| 242 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
| 243 $out =~ s/gz$//; | |
| 244 $out =~ s/sam$//; | |
| 245 $out =~ s/bam$//; | |
| 246 $out =~ s/txt$//; | |
| 247 $out =~ s/$/bedGraph/; | |
| 248 | |
| 249 my $bedGraph_output = $out; | |
| 250 my @args; | |
| 251 | |
| 252 if ($remove){ | |
| 253 push @args, '--remove'; | |
| 254 } | |
| 255 if ($CX_context){ | |
| 256 push @args, '--CX_context'; | |
| 257 } | |
| 258 if ($no_header){ | |
| 259 push @args, '--no_header'; | |
| 260 } | |
| 261 if ($gazillion){ | |
| 262 push @args, '--gazillion'; | |
| 263 } | |
| 264 if ($ample_mem){ | |
| 265 push @args, '--ample_memory'; | |
| 266 } | |
| 267 | |
| 268 | |
| 269 # if ($counts){ | |
| 270 # push @args, "--counts"; | |
| 271 # } | |
| 272 | |
| 273 push @args, "--buffer_size $sort_size"; | |
| 274 push @args, "--cutoff $coverage_threshold"; | |
| 275 push @args, "--output $bedGraph_output"; | |
| 276 push @args, "--dir '$output_dir'"; | |
| 277 | |
| 278 ### adding all files to be sorted to @args | |
| 279 foreach my $f (@sorting_files){ | |
| 280 push @args, $f; | |
| 281 } | |
| 282 | |
| 283 # print join "\t",@args,"\n"; | |
| 284 | |
| 285 system ("$Bin/bismark2bedGraph @args"); | |
| 286 | |
| 287 warn "Finished BedGraph conversion ...\n\n"; | |
| 288 sleep(3); | |
| 289 | |
| 290 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; | |
| 291 # warn "Writing bedGraph to file: $bedGraph_output\n"; | |
| 292 # process_bedGraph_output(); | |
| 293 # close OUT or die $!; | |
| 294 | |
| 295 ### genome-wide cytosine methylation report requires bedGraph processing anyway | |
| 296 if ($cytosine_report){ | |
| 297 | |
| 298 @args = (); # resetting @args | |
| 299 my $cytosine_out = $out; | |
| 300 $cytosine_out =~ s/bedGraph$//; | |
| 301 | |
| 302 if ($CX_context){ | |
| 303 $cytosine_out =~ s/$/CX_report.txt/; | |
| 304 } | |
| 305 else{ | |
| 306 $cytosine_out =~ s/$/CpG_report.txt/; | |
| 307 } | |
| 308 | |
| 309 push @args, "--output $cytosine_out"; | |
| 310 push @args, "--dir '$output_dir'"; | |
| 311 push @args, "--genome '$genome_folder'"; | |
| 312 push @args, "--parent_dir '$parent_dir'"; | |
| 313 | |
| 314 if ($zero){ | |
| 315 push @args, "--zero"; | |
| 316 } | |
| 317 if ($CX_context){ | |
| 318 push @args, '--CX_context'; | |
| 319 } | |
| 320 if ($split_by_chromosome){ | |
| 321 push @args, '--split_by_chromosome'; | |
| 322 } | |
| 323 | |
| 324 my $coverage_output = $bedGraph_output; | |
| 325 $coverage_output =~ s/bedGraph$/bismark.cov/; | |
| 326 | |
| 327 push @args, $output_dir . $coverage_output; # this will be the infile | |
| 328 | |
| 329 system ("$Bin/coverage2cytosine @args"); | |
| 330 # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); | |
| 331 warn "\n\nFinished generating genome-wide cytosine report\n\n"; | |
| 332 } | |
| 333 } | |
| 334 } | |
| 335 | |
| 336 sub delete_unused_files{ | |
| 337 | |
| 338 warn "Deleting unused files ...\n\n"; sleep(1); | |
| 339 | |
| 340 my $index = 0; | |
| 341 | |
| 342 while ($index <= $#sorting_files){ | |
| 343 if ($sorting_files[$index] =~ /gz$/){ | |
| 344 open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
| 345 } | |
| 346 else{ | |
| 347 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
| 348 } | |
| 349 | |
| 350 my $used = 0; | |
| 351 | |
| 352 while (<USED>){ | |
| 353 next if (/^Bismark/); | |
| 354 if ($_){ | |
| 355 $used = 1; | |
| 356 last; | |
| 357 } | |
| 358 } | |
| 359 | |
| 360 if ($used){ | |
| 361 warn "$sorting_files[$index] contains data ->\tkept\n"; | |
| 362 ++$index; | |
| 363 } | |
| 364 else{ | |
| 365 | |
| 366 my $delete = unlink $sorting_files[$index]; | |
| 367 | |
| 368 if ($delete){ | |
| 369 warn "$sorting_files[$index] was empty ->\tdeleted\n"; | |
| 370 } | |
| 371 else{ | |
| 372 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" | |
| 373 } | |
| 374 | |
| 375 ### we also need to remove the element from @sorting_files | |
| 376 splice @sorting_files, $index, 1; | |
| 377 } | |
| 378 } | |
| 379 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) | |
| 380 } | |
| 381 | |
| 382 sub produce_mbias_plots{ | |
| 383 | |
| 384 my $filename = shift; | |
| 385 | |
| 386 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
| 387 $mbias =~ s/gz$//; | |
| 388 $mbias =~ s/sam$//; | |
| 389 $mbias =~ s/bam$//; | |
| 390 $mbias =~ s/txt$//; | |
| 391 my $mbias_graph_1 = my $mbias_graph_2 = $mbias; | |
| 392 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; | |
| 393 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; | |
| 394 | |
| 395 $mbias =~ s/$/M-bias.txt/; | |
| 396 | |
| 397 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; | |
| 398 | |
| 399 # determining maximum read length | |
| 400 my $max_length_1 = 0; | |
| 401 my $max_length_2 = 0; | |
| 402 | |
| 403 foreach my $context (keys %mbias_1){ | |
| 404 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ | |
| 405 $max_length_1 = $pos unless ($max_length_1 >= $pos); | |
| 406 } | |
| 407 } | |
| 408 if ($paired){ | |
| 409 foreach my $context (keys %mbias_2){ | |
| 410 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ | |
| 411 $max_length_2 = $pos unless ($max_length_2 >= $pos); | |
| 412 } | |
| 413 } | |
| 414 } | |
| 415 | |
| 416 if ($single){ | |
| 417 warn "Determining maximum read length for M-Bias plot\n"; | |
| 418 warn "Maximum read length of Read 1: $max_length_1\n\n"; | |
| 419 } | |
| 420 else{ | |
| 421 warn "Determining maximum read lengths for M-Bias plots\n"; | |
| 422 warn "Maximum read length of Read 1: $max_length_1\n"; | |
| 423 warn "Maximum read length of Read 2: $max_length_2\n\n"; | |
| 424 } | |
| 425 # sleep(3); | |
| 426 | |
| 427 my @mbias_read1; | |
| 428 my @mbias_read2; | |
| 429 | |
| 430 #Check whether the module GD::Graph:lines is installed | |
| 431 my $gd_graph_installed = 0; | |
| 432 eval{ | |
| 433 require GD::Graph::lines; | |
| 434 GD::Graph::lines->import(); | |
| 435 }; | |
| 436 | |
| 437 unless($@) { # syntax or routine error variable, set if something goes wron in the last eval{ require ...} | |
| 438 $gd_graph_installed = 1; | |
| 439 | |
| 440 #Check whether the module GD::Graph::colour is installed | |
| 441 eval{ | |
| 442 require GD::Graph::colour; | |
| 443 GD::Graph::colour->import(qw(:colours :lists :files :convert)); | |
| 444 }; | |
| 445 | |
| 446 if ($@) { | |
| 447 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
| 448 sleep(2); | |
| 449 $gd_graph_installed = 0; | |
| 450 } | |
| 451 | |
| 452 | |
| 453 } | |
| 454 else{ | |
| 455 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
| 456 sleep(2); | |
| 457 } | |
| 458 | |
| 459 | |
| 460 my $graph_title; | |
| 461 my $graph1; | |
| 462 my $graph2; | |
| 463 | |
| 464 if ( $gd_graph_installed){ | |
| 465 $graph1 = GD::Graph::lines->new(800,600); | |
| 466 if ($paired){ | |
| 467 $graph2 = GD::Graph::lines->new(800,600); | |
| 468 } | |
| 469 } | |
| 470 | |
| 471 foreach my $context (qw(CpG CHG CHH)){ | |
| 472 @{$mbias_read1[0]} = (); | |
| 473 | |
| 474 if ($paired){ | |
| 475 print MBIAS "$context context (R1)\n================\n"; | |
| 476 $graph_title = 'M-bias (Read 1)'; | |
| 477 } | |
| 478 else{ | |
| 479 print MBIAS "$context context\n===========\n"; | |
| 480 $graph_title = 'M-bias'; | |
| 481 } | |
| 482 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
| 483 | |
| 484 foreach my $pos (1..$max_length_1){ | |
| 485 | |
| 486 unless (defined $mbias_1{$context}->{$pos}->{meth}){ | |
| 487 $mbias_1{$context}->{$pos}->{meth} = 0; | |
| 488 } | |
| 489 unless (defined $mbias_1{$context}->{$pos}->{un}){ | |
| 490 $mbias_1{$context}->{$pos}->{un} = 0; | |
| 491 } | |
| 492 | |
| 493 my $percent = ''; | |
| 494 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ | |
| 495 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); | |
| 496 } | |
| 497 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; | |
| 498 | |
| 499 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
| 500 push @{$mbias_read1[0]},$pos; | |
| 501 | |
| 502 if ($context eq 'CpG'){ | |
| 503 push @{$mbias_read1[1]},$percent; | |
| 504 push @{$mbias_read1[4]},$coverage; | |
| 505 } | |
| 506 elsif ($context eq 'CHG'){ | |
| 507 push @{$mbias_read1[2]},$percent; | |
| 508 push @{$mbias_read1[5]},$coverage; | |
| 509 } | |
| 510 elsif ($context eq 'CHH'){ | |
| 511 push @{$mbias_read1[3]},$percent; | |
| 512 push @{$mbias_read1[6]},$coverage; | |
| 513 } | |
| 514 } | |
| 515 print MBIAS "\n"; | |
| 516 } | |
| 517 | |
| 518 if ( $gd_graph_installed){ | |
| 519 | |
| 520 add_colour(nice_blue => [31,120,180]); | |
| 521 add_colour(nice_orange => [255,127,0]); | |
| 522 add_colour(nice_green => [51,160,44]); | |
| 523 add_colour(pale_blue => [153,206,227]); | |
| 524 add_colour(pale_orange => [253,204,138]); | |
| 525 add_colour(pale_green => [191,230,207]); | |
| 526 | |
| 527 $graph1->set( | |
| 528 x_label => 'position (bp)', | |
| 529 y1_label => '% methylation', | |
| 530 y2_label => '# methylation calls', | |
| 531 title => $graph_title, | |
| 532 line_width => 2, | |
| 533 x_max_value => $max_length_1, | |
| 534 x_min_value => 0, | |
| 535 y_tick_number => 10, | |
| 536 y_label_skip => 2, | |
| 537 y1_max_value => 100, | |
| 538 y1_min_value => 0, | |
| 539 y_label_skip => 2, | |
| 540 y2_min_value => 0, | |
| 541 x_label_skip => 5, | |
| 542 x_label_position => 0.5, | |
| 543 x_tick_offset => -1, | |
| 544 bgclr => 'white', | |
| 545 transparent => 0, | |
| 546 two_axes => 1, | |
| 547 use_axis => [1,1,1,2,2,2], | |
| 548 legend_placement => 'RC', | |
| 549 legend_spacing => 6, | |
| 550 legend_marker_width => 24, | |
| 551 legend_marker_height => 18, | |
| 552 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
| 553 ) or die $graph1->error; | |
| 554 | |
| 555 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
| 556 | |
| 557 my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; | |
| 558 | |
| 559 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; | |
| 560 binmode MBIAS_G1; | |
| 561 print MBIAS_G1 $gd1->png; | |
| 562 } | |
| 563 | |
| 564 if ($paired){ | |
| 565 | |
| 566 foreach my $context (qw(CpG CHG CHH)){ | |
| 567 @{$mbias_read2[0]} = (); | |
| 568 | |
| 569 print MBIAS "$context context (R2)\n================\n"; | |
| 570 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
| 571 foreach my $pos (1..$max_length_2){ | |
| 572 | |
| 573 unless (defined $mbias_2{$context}->{$pos}->{meth}){ | |
| 574 $mbias_2{$context}->{$pos}->{meth} = 0; | |
| 575 } | |
| 576 unless (defined $mbias_2{$context}->{$pos}->{un}){ | |
| 577 $mbias_2{$context}->{$pos}->{un} = 0; | |
| 578 } | |
| 579 | |
| 580 my $percent = ''; | |
| 581 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ | |
| 582 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); | |
| 583 } | |
| 584 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; | |
| 585 | |
| 586 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
| 587 | |
| 588 push @{$mbias_read2[0]},$pos; | |
| 589 | |
| 590 if ($context eq 'CpG'){ | |
| 591 push @{$mbias_read2[1]},$percent; | |
| 592 push @{$mbias_read2[4]},$coverage; | |
| 593 } | |
| 594 elsif ($context eq 'CHG'){ | |
| 595 push @{$mbias_read2[2]},$percent; | |
| 596 push @{$mbias_read2[5]},$coverage; | |
| 597 } | |
| 598 elsif ($context eq 'CHH'){ | |
| 599 push @{$mbias_read2[3]},$percent; | |
| 600 push @{$mbias_read2[6]},$coverage; | |
| 601 } | |
| 602 } | |
| 603 print MBIAS "\n"; | |
| 604 } | |
| 605 | |
| 606 if ( $gd_graph_installed){ | |
| 607 | |
| 608 add_colour(nice_blue => [31,120,180]); | |
| 609 add_colour(nice_orange => [255,127,0]); | |
| 610 add_colour(nice_green => [51,160,44]); | |
| 611 add_colour(pale_blue => [153,206,227]); | |
| 612 add_colour(pale_orange => [253,204,138]); | |
| 613 add_colour(pale_green => [191,230,207]); | |
| 614 | |
| 615 $graph2->set( | |
| 616 x_label => 'position (bp)', | |
| 617 line_width => 2, | |
| 618 x_max_value => $max_length_1, | |
| 619 x_min_value => 0, | |
| 620 y_tick_number => 10, | |
| 621 y_label_skip => 2, | |
| 622 y1_max_value => 100, | |
| 623 y1_min_value => 0, | |
| 624 y_label_skip => 2, | |
| 625 y2_min_value => 0, | |
| 626 x_label_skip => 5, | |
| 627 x_label_position => 0.5, | |
| 628 x_tick_offset => -1, | |
| 629 bgclr => 'white', | |
| 630 transparent => 0, | |
| 631 two_axes => 1, | |
| 632 use_axis => [1,1,1,2,2,2], | |
| 633 legend_placement => 'RC', | |
| 634 legend_spacing => 6, | |
| 635 legend_marker_width => 24, | |
| 636 legend_marker_height => 18, | |
| 637 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
| 638 x_label => 'position (bp)', | |
| 639 y1_label => '% methylation', | |
| 640 y2_label => '# calls', | |
| 641 title => 'M-bias (Read 2)', | |
| 642 ) or die $graph2->error; | |
| 643 | |
| 644 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
| 645 my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; | |
| 646 | |
| 647 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; | |
| 648 binmode MBIAS_G2; | |
| 649 print MBIAS_G2 $gd2->png; | |
| 650 | |
| 651 } | |
| 652 } | |
| 653 } | |
| 654 | |
| 655 sub process_commandline{ | |
| 656 my $help; | |
| 657 my $single_end; | |
| 658 my $paired_end; | |
| 659 my $ignore; | |
| 660 my $ignore_r2; | |
| 661 my $genomic_fasta; | |
| 662 my $full; | |
| 663 my $report; | |
| 664 my $extractor_version; | |
| 665 my $no_overlap; | |
| 666 my $merge_non_CpG; | |
| 667 my $vanilla; | |
| 668 my $output_dir; | |
| 669 my $no_header; | |
| 670 my $bedGraph; | |
| 671 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status | |
| 672 my $remove; | |
| 673 my $counts; | |
| 674 my $cytosine_report; | |
| 675 my $genome_folder; | |
| 676 my $zero; | |
| 677 my $CpG_only; | |
| 678 my $CX_context; | |
| 679 my $split_by_chromosome; | |
| 680 my $sort_size; | |
| 681 my $samtools_path; | |
| 682 my $gzip; | |
| 683 my $mbias_only; | |
| 684 my $gazillion; | |
| 685 my $ample_mem; | |
| 686 | |
| 687 my $command_line = GetOptions ('help|man' => \$help, | |
| 688 'p|paired-end' => \$paired_end, | |
| 689 's|single-end' => \$single_end, | |
| 690 'fasta' => \$genomic_fasta, | |
| 691 'ignore=i' => \$ignore, | |
| 692 'ignore_r2=i' => \$ignore_r2, | |
| 693 'comprehensive' => \$full, | |
| 694 'report' => \$report, | |
| 695 'version' => \$extractor_version, | |
| 696 'no_overlap' => \$no_overlap, | |
| 697 'merge_non_CpG' => \$merge_non_CpG, | |
| 698 'vanilla' => \$vanilla, | |
| 699 'o|output=s' => \$output_dir, | |
| 700 'no_header' => \$no_header, | |
| 701 'bedGraph' => \$bedGraph, | |
| 702 "cutoff=i" => \$coverage_threshold, | |
| 703 "remove_spaces" => \$remove, | |
| 704 "counts" => \$counts, | |
| 705 "cytosine_report" => \$cytosine_report, | |
| 706 'g|genome_folder=s' => \$genome_folder, | |
| 707 "zero_based" => \$zero, | |
| 708 "CX|CX_context" => \$CX_context, | |
| 709 "split_by_chromosome" => \$split_by_chromosome, | |
| 710 "buffer_size=s" => \$sort_size, | |
| 711 'samtools_path=s' => \$samtools_path, | |
| 712 "gzip" => \$gzip, | |
| 713 "mbias_only" => \$mbias_only, | |
| 714 "gazillion|scaffolds" => \$gazillion, | |
| 715 "ample_memory" => \$ample_mem, | |
| 716 ); | |
| 717 | |
| 718 ### EXIT ON ERROR if there were errors with any of the supplied options | |
| 719 unless ($command_line){ | |
| 720 die "Please respecify command line options\n"; | |
| 721 } | |
| 722 | |
| 723 ### HELPFILE | |
| 724 if ($help){ | |
| 725 print_helpfile(); | |
| 726 exit; | |
| 727 } | |
| 728 | |
| 729 if ($extractor_version){ | |
| 730 print << "VERSION"; | |
| 731 | |
| 732 | |
| 733 Bismark Methylation Extractor | |
| 734 | |
| 735 Bismark Extractor Version: $version | |
| 736 Copyright 2010-13 Felix Krueger, Babraham Bioinformatics | |
| 737 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
| 738 | |
| 739 | |
| 740 VERSION | |
| 741 exit; | |
| 742 } | |
| 743 | |
| 744 | |
| 745 ### no files provided | |
| 746 unless (@ARGV){ | |
| 747 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
| 748 } | |
| 749 @filenames = @ARGV; | |
| 750 | |
| 751 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | |
| 752 | |
| 753 ### M-BIAS ONLY | |
| 754 if ($mbias_only){ | |
| 755 if ($bedGraph){ | |
| 756 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; | |
| 757 } | |
| 758 if ($cytosine_report){ | |
| 759 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; | |
| 760 } | |
| 761 if ($merge_non_CpG){ | |
| 762 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; | |
| 763 } | |
| 764 if ($full){ | |
| 765 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; | |
| 766 } | |
| 767 sleep(3); | |
| 768 } | |
| 769 | |
| 770 ### PRINT A REPORT | |
| 771 unless ($report){ | |
| 772 $report = 0; | |
| 773 } | |
| 774 | |
| 775 ### OUTPUT DIR PATH | |
| 776 if ($output_dir){ | |
| 777 unless ($output_dir =~ /\/$/){ | |
| 778 $output_dir =~ s/$/\//; | |
| 779 } | |
| 780 } | |
| 781 else{ | |
| 782 $output_dir = ''; | |
| 783 } | |
| 784 | |
| 785 ### NO HEADER | |
| 786 unless ($no_header){ | |
| 787 $no_header = 0; | |
| 788 } | |
| 789 | |
| 790 ### OLD (VANILLA) OUTPUT FORMAT | |
| 791 unless ($vanilla){ | |
| 792 $vanilla = 0; | |
| 793 } | |
| 794 | |
| 795 if ($single_end){ | |
| 796 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
| 797 } | |
| 798 elsif ($paired_end){ | |
| 799 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
| 800 } | |
| 801 else{ | |
| 802 | |
| 803 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header | |
| 804 | |
| 805 if ($vanilla){ | |
| 806 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; | |
| 807 } | |
| 808 else{ # SAM/BAM format | |
| 809 | |
| 810 my $file = $filenames[0]; | |
| 811 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); | |
| 812 | |
| 813 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file | |
| 814 if ($file =~ /\.gz$/){ | |
| 815 open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
| 816 } | |
| 817 elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){ | |
| 818 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; | |
| 819 } | |
| 820 else{ | |
| 821 open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; | |
| 822 } | |
| 823 | |
| 824 while (<DETERMINE>){ | |
| 825 last unless (/^\@/); | |
| 826 if ($_ =~ /^\@PG/){ | |
| 827 # warn "found the \@PG line:\n"; | |
| 828 # warn "$_"; | |
| 829 | |
| 830 if ($_ =~ /-1/ and $_ =~ /-2/){ | |
| 831 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
| 832 $paired_end = 1; | |
| 833 $single_end = 0; | |
| 834 } | |
| 835 else{ | |
| 836 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
| 837 $paired_end = 0; | |
| 838 $single_end = 1; | |
| 839 } | |
| 840 } | |
| 841 } | |
| 842 | |
| 843 close DETERMINE or warn $!; | |
| 844 | |
| 845 } | |
| 846 } | |
| 847 | |
| 848 ### IGNORING <INT> bases at the start of the read when processing the methylation call string | |
| 849 unless ($ignore){ | |
| 850 $ignore = 0; | |
| 851 } | |
| 852 | |
| 853 if (defined $ignore_r2){ | |
| 854 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); | |
| 855 } | |
| 856 else{ | |
| 857 $ignore_r2 = 0; | |
| 858 } | |
| 859 | |
| 860 | |
| 861 ### NO OVERLAP | |
| 862 if ($no_overlap){ | |
| 863 die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | |
| 864 } | |
| 865 else{ | |
| 866 $no_overlap = 0; | |
| 867 } | |
| 868 | |
| 869 ### COMPREHENSIVE OUTPUT | |
| 870 unless ($full){ | |
| 871 $full = 0; | |
| 872 } | |
| 873 | |
| 874 ### MERGE NON-CpG context | |
| 875 unless ($merge_non_CpG){ | |
| 876 $merge_non_CpG = 0; | |
| 877 } | |
| 878 | |
| 879 ### remove white spaces in read ID (needed for sorting using the sort command | |
| 880 unless ($remove){ | |
| 881 $remove = 0; | |
| 882 } | |
| 883 | |
| 884 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT | |
| 885 if (defined $coverage_threshold){ | |
| 886 unless ($coverage_threshold > 0){ | |
| 887 die "Please select a coverage greater than 0 (positive integers only)\n"; | |
| 888 } | |
| 889 } | |
| 890 else{ | |
| 891 $coverage_threshold = 1; | |
| 892 } | |
| 893 | |
| 894 ### SORT buffer size | |
| 895 if (defined $sort_size){ | |
| 896 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ | |
| 897 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; | |
| 898 } | |
| 899 } | |
| 900 else{ | |
| 901 $sort_size = '2G'; | |
| 902 } | |
| 903 | |
| 904 if ($zero){ | |
| 905 die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); | |
| 906 } | |
| 907 | |
| 908 if ($CX_context){ | |
| 909 die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); | |
| 910 } | |
| 911 else{ | |
| 912 $CX_context = 0; | |
| 913 } | |
| 914 | |
| 915 unless ($counts){ | |
| 916 $counts = 1; # counts will always be set | |
| 917 } | |
| 918 | |
| 919 if ($cytosine_report){ | |
| 920 | |
| 921 ### GENOME folder | |
| 922 if ($genome_folder){ | |
| 923 unless ($genome_folder =~/\/$/){ | |
| 924 $genome_folder =~ s/$/\//; | |
| 925 } | |
| 926 } | |
| 927 else{ | |
| 928 die "Please specify a genome folder to proceed (full path only)\n"; | |
| 929 } | |
| 930 | |
| 931 unless ($bedGraph){ | |
| 932 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; | |
| 933 $bedGraph = 1; | |
| 934 } | |
| 935 unless ($counts){ | |
| 936 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; | |
| 937 $counts = 1; | |
| 938 } | |
| 939 warn "\n"; | |
| 940 } | |
| 941 | |
| 942 ### PATH TO SAMTOOLS | |
| 943 if (defined $samtools_path){ | |
| 944 # if Samtools was specified as full command | |
| 945 if ($samtools_path =~ /samtools$/){ | |
| 946 if (-e $samtools_path){ | |
| 947 # Samtools executable found | |
| 948 } | |
| 949 else{ | |
| 950 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
| 951 } | |
| 952 } | |
| 953 else{ | |
| 954 unless ($samtools_path =~ /\/$/){ | |
| 955 $samtools_path =~ s/$/\//; | |
| 956 } | |
| 957 $samtools_path .= 'samtools'; | |
| 958 if (-e $samtools_path){ | |
| 959 # Samtools executable found | |
| 960 } | |
| 961 else{ | |
| 962 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
| 963 } | |
| 964 } | |
| 965 } | |
| 966 # Check whether Samtools is in the PATH if no path was supplied by the user | |
| 967 else{ | |
| 968 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH | |
| 969 $samtools_path = `which samtools`; | |
| 970 chomp $samtools_path; | |
| 971 } | |
| 972 } | |
| 973 | |
| 974 unless (defined $samtools_path){ | |
| 975 $samtools_path = ''; | |
| 976 } | |
| 977 | |
| 978 | |
| 979 if ($gazillion){ | |
| 980 if ($ample_mem){ | |
| 981 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; | |
| 982 } | |
| 983 } | |
| 984 | |
| 985 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem); | |
| 986 } | |
| 987 | |
| 988 | |
| 989 sub test_positional_sorting{ | |
| 990 | |
| 991 my $filename = shift; | |
| 992 | |
| 993 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; | |
| 994 sleep(1); | |
| 995 | |
| 996 if ($filename =~ /\.gz$/) { | |
| 997 open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
| 998 } | |
| 999 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { | |
| 1000 if ($samtools_path){ | |
| 1001 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; | |
| 1002 } | |
| 1003 else{ | |
| 1004 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
| 1005 } | |
| 1006 } | |
| 1007 else { | |
| 1008 open (TEST,$filename) or die "Can't open file $filename: $!\n"; | |
| 1009 } | |
| 1010 | |
| 1011 my $count = 0; | |
| 1012 | |
| 1013 while (<TEST>) { | |
| 1014 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) | |
| 1015 if (/^\@SO/) { | |
| 1016 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; | |
| 1017 } | |
| 1018 next; | |
| 1019 } | |
| 1020 $count++; | |
| 1021 | |
| 1022 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID | |
| 1023 | |
| 1024 my ($id_1) = (split (/\t/)); | |
| 1025 | |
| 1026 ### reading the next line which should be read 2 | |
| 1027 $_ = <TEST>; | |
| 1028 my ($id_2) = (split (/\t/)); | |
| 1029 last unless ($id_2); | |
| 1030 ++$count; | |
| 1031 | |
| 1032 if ($id_1 eq $id_2){ | |
| 1033 ### ids are the same | |
| 1034 next; | |
| 1035 } | |
| 1036 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first | |
| 1037 my $id_1_trunc = $id_1; | |
| 1038 $id_1_trunc =~ s/\/1$//; | |
| 1039 my $id_2_trunc = $id_2; | |
| 1040 $id_2_trunc =~ s/\/2$//; | |
| 1041 | |
| 1042 unless ($id_1_trunc eq $id_2_trunc){ | |
| 1043 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; | |
| 1044 } | |
| 1045 } | |
| 1046 } | |
| 1047 # close TEST or die $!; somehow fails on our cluster... | |
| 1048 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) | |
| 1049 warn "...passed!\n"; | |
| 1050 sleep(1); | |
| 1051 | |
| 1052 } | |
| 1053 | |
| 1054 | |
| 1055 sub process_Bismark_results_file{ | |
| 1056 my $filename = shift; | |
| 1057 | |
| 1058 warn "\nNow reading in Bismark result file $filename\n\n"; | |
| 1059 | |
| 1060 if ($filename =~ /\.gz$/) { | |
| 1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
| 1062 } | |
| 1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { | |
| 1064 if ($samtools_path){ | |
| 1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; | |
| 1066 } | |
| 1067 else{ | |
| 1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
| 1069 } | |
| 1070 } | |
| 1071 else { | |
| 1072 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
| 1073 } | |
| 1074 | |
| 1075 ### Vanilla and SAM output need to read different numbers of header lines | |
| 1076 if ($vanilla) { | |
| 1077 my $bismark_version = <IN>; ## discarding the Bismark version info | |
| 1078 chomp $bismark_version; | |
| 1079 $bismark_version =~ s/\r//; # replaces \r line feed | |
| 1080 $bismark_version =~ s/Bismark version: //; | |
| 1081 if ($bismark_version =~ /^\@/) { | |
| 1082 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; | |
| 1083 sleep (2); | |
| 1084 } | |
| 1085 | |
| 1086 unless ($version eq $bismark_version){ | |
| 1087 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; | |
| 1088 } | |
| 1089 } else { | |
| 1090 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. | |
| 1091 # We are reading from it further down | |
| 1092 } | |
| 1093 | |
| 1094 my $output_filename = (split (/\//,$filename))[-1]; | |
| 1095 | |
| 1096 ### OPENING OUTPUT-FILEHANDLES | |
| 1097 if ($report) { | |
| 1098 my $report_filename = $output_filename; | |
| 1099 $report_filename =~ s/\.sam$//; | |
| 1100 $report_filename =~ s/\.txt$//; | |
| 1101 $report_filename =~ s/$/_splitting_report.txt/; | |
| 1102 $report_filename = $output_dir . $report_filename; | |
| 1103 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; | |
| 1104 } | |
| 1105 | |
| 1106 if ($report) { | |
| 1107 print REPORT "$output_filename\n\n"; | |
| 1108 print REPORT "Parameters used to extract methylation information:\n"; | |
| 1109 if ($paired) { | |
| 1110 if ($vanilla) { | |
| 1111 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; | |
| 1112 } else { | |
| 1113 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default | |
| 1114 } | |
| 1115 } | |
| 1116 | |
| 1117 if ($single) { | |
| 1118 if ($vanilla) { | |
| 1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | |
| 1120 } else { | |
| 1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | |
| 1122 } | |
| 1123 } | |
| 1124 if ($single){ | |
| 1125 if ($ignore) { | |
| 1126 print REPORT "Ignoring first $ignore bp\n"; | |
| 1127 } | |
| 1128 } | |
| 1129 else{ # paired-end | |
| 1130 if ($ignore) { | |
| 1131 print REPORT "Ignoring first $ignore bp of Read 1\n"; | |
| 1132 } | |
| 1133 if ($ignore_r2){ | |
| 1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; | |
| 1135 } | |
| 1136 } | |
| 1137 | |
| 1138 if ($full) { | |
| 1139 print REPORT "Output specified: comprehensive\n"; | |
| 1140 } else { | |
| 1141 print REPORT "Output specified: strand-specific (default)\n"; | |
| 1142 } | |
| 1143 | |
| 1144 if ($no_overlap) { | |
| 1145 print REPORT "No overlapping methylation calls specified\n"; | |
| 1146 } | |
| 1147 if ($genomic_fasta) { | |
| 1148 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; | |
| 1149 } | |
| 1150 if ($merge_non_CpG) { | |
| 1151 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; | |
| 1152 } | |
| 1153 | |
| 1154 print REPORT "\n"; | |
| 1155 } | |
| 1156 | |
| 1157 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; | |
| 1158 | |
| 1159 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL | |
| 1160 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file | |
| 1161 if ($full and $merge_non_CpG) { | |
| 1162 my $cpg_output = my $other_c_output = $output_filename; | |
| 1163 ### C in CpG context | |
| 1164 $cpg_output =~ s/^/CpG_context_/; | |
| 1165 $cpg_output =~ s/sam$/txt/; | |
| 1166 $cpg_output =~ s/bam$/txt/; | |
| 1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
| 1168 $cpg_output = $output_dir . $cpg_output; | |
| 1169 | |
| 1170 if ($gzip){ | |
| 1171 $cpg_output .= '.gz'; | |
| 1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
| 1173 } | |
| 1174 else{ | |
| 1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
| 1176 } | |
| 1177 | |
| 1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); | |
| 1179 push @sorting_files,$cpg_output; | |
| 1180 | |
| 1181 unless ($no_header) { | |
| 1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1183 } | |
| 1184 | |
| 1185 ### C in any other context than CpG | |
| 1186 $other_c_output =~ s/^/Non_CpG_context_/; | |
| 1187 $other_c_output =~ s/sam$/txt/; | |
| 1188 $other_c_output =~ s/bam$/txt/; | |
| 1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | |
| 1190 $other_c_output = $output_dir . $other_c_output; | |
| 1191 | |
| 1192 if ($gzip){ | |
| 1193 $other_c_output .= '.gz'; | |
| 1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); | |
| 1195 } | |
| 1196 else{ | |
| 1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); | |
| 1198 } | |
| 1199 | |
| 1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); | |
| 1201 push @sorting_files,$other_c_output; | |
| 1202 | |
| 1203 | |
| 1204 unless ($no_header) { | |
| 1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1206 } | |
| 1207 } | |
| 1208 | |
| 1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | |
| 1210 elsif ($merge_non_CpG) { | |
| 1211 | |
| 1212 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
| 1213 | |
| 1214 ### For cytosines in CpG context | |
| 1215 $cpg_ot =~ s/^/CpG_OT_/; | |
| 1216 $cpg_ot =~ s/sam$/txt/; | |
| 1217 $cpg_ot =~ s/bam$/txt/; | |
| 1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
| 1219 $cpg_ot = $output_dir . $cpg_ot; | |
| 1220 | |
| 1221 if ($gzip){ | |
| 1222 $cpg_ot .= '.gz'; | |
| 1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
| 1224 } | |
| 1225 else{ | |
| 1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
| 1227 } | |
| 1228 | |
| 1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); | |
| 1230 push @sorting_files,$cpg_ot; | |
| 1231 | |
| 1232 unless($no_header){ | |
| 1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1234 } | |
| 1235 | |
| 1236 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
| 1237 $cpg_ctot =~ s/sam$/txt/; | |
| 1238 $cpg_ctot =~ s/bam$/txt/; | |
| 1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
| 1240 $cpg_ctot = $output_dir . $cpg_ctot; | |
| 1241 | |
| 1242 if ($gzip){ | |
| 1243 $cpg_ctot .= '.gz'; | |
| 1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
| 1245 } | |
| 1246 else{ | |
| 1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
| 1248 } | |
| 1249 | |
| 1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); | |
| 1251 push @sorting_files,$cpg_ctot; | |
| 1252 | |
| 1253 unless($no_header){ | |
| 1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1255 } | |
| 1256 | |
| 1257 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
| 1258 $cpg_ctob =~ s/sam$/txt/; | |
| 1259 $cpg_ctob =~ s/bam$/txt/; | |
| 1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
| 1261 $cpg_ctob = $output_dir . $cpg_ctob; | |
| 1262 | |
| 1263 if ($gzip){ | |
| 1264 $cpg_ctob .= '.gz'; | |
| 1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
| 1266 } | |
| 1267 else{ | |
| 1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
| 1269 } | |
| 1270 | |
| 1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); | |
| 1272 push @sorting_files,$cpg_ctob; | |
| 1273 | |
| 1274 unless($no_header){ | |
| 1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1276 } | |
| 1277 | |
| 1278 $cpg_ob =~ s/^/CpG_OB_/; | |
| 1279 $cpg_ob =~ s/sam$/txt/; | |
| 1280 $cpg_ob =~ s/bam$/txt/; | |
| 1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
| 1282 $cpg_ob = $output_dir . $cpg_ob; | |
| 1283 | |
| 1284 if ($gzip){ | |
| 1285 $cpg_ob .= '.gz'; | |
| 1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
| 1287 } | |
| 1288 else{ | |
| 1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
| 1290 } | |
| 1291 | |
| 1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); | |
| 1293 push @sorting_files,$cpg_ob; | |
| 1294 | |
| 1295 unless($no_header){ | |
| 1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1297 } | |
| 1298 | |
| 1299 ### For cytosines in Non-CpG (CC, CT or CA) context | |
| 1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | |
| 1301 | |
| 1302 $other_c_ot =~ s/^/Non_CpG_OT_/; | |
| 1303 $other_c_ot =~ s/sam$/txt/; | |
| 1304 $other_c_ot =~ s/bam$/txt/; | |
| 1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | |
| 1306 $other_c_ot = $output_dir . $other_c_ot; | |
| 1307 | |
| 1308 if ($gzip){ | |
| 1309 $other_c_ot .= '.gz'; | |
| 1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); | |
| 1311 } | |
| 1312 else{ | |
| 1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); | |
| 1314 } | |
| 1315 | |
| 1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); | |
| 1317 push @sorting_files,$other_c_ot; | |
| 1318 | |
| 1319 unless($no_header){ | |
| 1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1321 } | |
| 1322 | |
| 1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | |
| 1324 $other_c_ctot =~ s/sam$/txt/; | |
| 1325 $other_c_ctot =~ s/bam$/txt/; | |
| 1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | |
| 1327 $other_c_ctot = $output_dir . $other_c_ctot; | |
| 1328 | |
| 1329 if ($gzip){ | |
| 1330 $other_c_ctot .= '.gz'; | |
| 1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); | |
| 1332 } | |
| 1333 else{ | |
| 1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); | |
| 1335 } | |
| 1336 | |
| 1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); | |
| 1338 push @sorting_files,$other_c_ctot; | |
| 1339 | |
| 1340 unless($no_header){ | |
| 1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1342 } | |
| 1343 | |
| 1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | |
| 1345 $other_c_ctob =~ s/sam$/txt/; | |
| 1346 $other_c_ctob =~ s/bam$/txt/; | |
| 1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | |
| 1348 $other_c_ctob = $output_dir . $other_c_ctob; | |
| 1349 | |
| 1350 if ($gzip){ | |
| 1351 $other_c_ctob .= '.gz'; | |
| 1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); | |
| 1353 } | |
| 1354 else{ | |
| 1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); | |
| 1356 } | |
| 1357 | |
| 1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); | |
| 1359 push @sorting_files,$other_c_ctob; | |
| 1360 | |
| 1361 unless($no_header){ | |
| 1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1363 } | |
| 1364 | |
| 1365 $other_c_ob =~ s/^/Non_CpG_OB_/; | |
| 1366 $other_c_ob =~ s/sam$/txt/; | |
| 1367 $other_c_ob =~ s/sam$/txt/; | |
| 1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | |
| 1369 $other_c_ob = $output_dir . $other_c_ob; | |
| 1370 | |
| 1371 if ($gzip){ | |
| 1372 $other_c_ob .= '.gz'; | |
| 1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); | |
| 1374 } | |
| 1375 else{ | |
| 1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); | |
| 1377 } | |
| 1378 | |
| 1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); | |
| 1380 push @sorting_files,$other_c_ob; | |
| 1381 | |
| 1382 unless($no_header){ | |
| 1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1384 } | |
| 1385 } | |
| 1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | |
| 1387 | |
| 1388 ### if --comprehensive was specified we are only writing one file per context | |
| 1389 elsif ($full) { | |
| 1390 my $cpg_output = my $chg_output = my $chh_output = $output_filename; | |
| 1391 ### C in CpG context | |
| 1392 $cpg_output =~ s/^/CpG_context_/; | |
| 1393 $cpg_output =~ s/sam$/txt/; | |
| 1394 $cpg_output =~ s/bam$/txt/; | |
| 1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
| 1396 $cpg_output = $output_dir . $cpg_output; | |
| 1397 | |
| 1398 if ($gzip){ | |
| 1399 $cpg_output .= '.gz'; | |
| 1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
| 1401 } | |
| 1402 else{ | |
| 1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
| 1404 } | |
| 1405 | |
| 1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); | |
| 1407 push @sorting_files,$cpg_output; | |
| 1408 | |
| 1409 unless($no_header){ | |
| 1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1411 } | |
| 1412 | |
| 1413 ### C in CHG context | |
| 1414 $chg_output =~ s/^/CHG_context_/; | |
| 1415 $chg_output =~ s/sam$/txt/; | |
| 1416 $chg_output =~ s/bam$/txt/; | |
| 1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | |
| 1418 $chg_output = $output_dir . $chg_output; | |
| 1419 | |
| 1420 if ($gzip){ | |
| 1421 $chg_output .= '.gz'; | |
| 1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); | |
| 1423 } | |
| 1424 else{ | |
| 1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); | |
| 1426 } | |
| 1427 | |
| 1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); | |
| 1429 push @sorting_files,$chg_output; | |
| 1430 | |
| 1431 unless($no_header){ | |
| 1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1433 } | |
| 1434 | |
| 1435 ### C in CHH context | |
| 1436 $chh_output =~ s/^/CHH_context_/; | |
| 1437 $chh_output =~ s/sam$/txt/; | |
| 1438 $chh_output =~ s/bam$/txt/; | |
| 1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | |
| 1440 $chh_output = $output_dir . $chh_output; | |
| 1441 | |
| 1442 if ($gzip){ | |
| 1443 $chh_output .= '.gz'; | |
| 1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); | |
| 1445 } | |
| 1446 else{ | |
| 1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); | |
| 1448 } | |
| 1449 | |
| 1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); | |
| 1451 push @sorting_files, $chh_output; | |
| 1452 | |
| 1453 unless($no_header){ | |
| 1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1455 } | |
| 1456 } | |
| 1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | |
| 1458 else { | |
| 1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
| 1460 | |
| 1461 ### For cytosines in CpG context | |
| 1462 $cpg_ot =~ s/^/CpG_OT_/; | |
| 1463 $cpg_ot =~ s/sam$/txt/; | |
| 1464 $cpg_ot =~ s/bam$/txt/; | |
| 1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
| 1466 $cpg_ot = $output_dir . $cpg_ot; | |
| 1467 | |
| 1468 if ($gzip){ | |
| 1469 $cpg_ot .= '.gz'; | |
| 1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
| 1471 } | |
| 1472 else{ | |
| 1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
| 1474 } | |
| 1475 | |
| 1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); | |
| 1477 push @sorting_files,$cpg_ot; | |
| 1478 | |
| 1479 unless($no_header){ | |
| 1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1481 } | |
| 1482 | |
| 1483 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
| 1484 $cpg_ctot =~ s/sam$/txt/; | |
| 1485 $cpg_ctot =~ s/bam$/txt/; | |
| 1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
| 1487 $cpg_ctot = $output_dir . $cpg_ctot; | |
| 1488 | |
| 1489 if ($gzip){ | |
| 1490 $cpg_ctot .= '.gz'; | |
| 1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
| 1492 } | |
| 1493 else{ | |
| 1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
| 1495 } | |
| 1496 | |
| 1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); | |
| 1498 push @sorting_files,$cpg_ctot; | |
| 1499 | |
| 1500 unless($no_header){ | |
| 1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1502 } | |
| 1503 | |
| 1504 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
| 1505 $cpg_ctob =~ s/sam$/txt/; | |
| 1506 $cpg_ctob =~ s/bam$/txt/; | |
| 1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
| 1508 $cpg_ctob = $output_dir . $cpg_ctob; | |
| 1509 | |
| 1510 if ($gzip){ | |
| 1511 $cpg_ctob .= '.gz'; | |
| 1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
| 1513 } | |
| 1514 else{ | |
| 1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
| 1516 } | |
| 1517 | |
| 1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); | |
| 1519 push @sorting_files,$cpg_ctob; | |
| 1520 | |
| 1521 unless($no_header){ | |
| 1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1523 } | |
| 1524 | |
| 1525 $cpg_ob =~ s/^/CpG_OB_/; | |
| 1526 $cpg_ob =~ s/sam$/txt/; | |
| 1527 $cpg_ob =~ s/bam$/txt/; | |
| 1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
| 1529 $cpg_ob = $output_dir . $cpg_ob; | |
| 1530 | |
| 1531 if ($gzip){ | |
| 1532 $cpg_ob .= '.gz'; | |
| 1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
| 1534 } | |
| 1535 else{ | |
| 1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
| 1537 } | |
| 1538 | |
| 1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); | |
| 1540 push @sorting_files,$cpg_ob; | |
| 1541 | |
| 1542 unless($no_header){ | |
| 1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1544 } | |
| 1545 | |
| 1546 ### For cytosines in CHG context | |
| 1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | |
| 1548 | |
| 1549 $chg_ot =~ s/^/CHG_OT_/; | |
| 1550 $chg_ot =~ s/sam$/txt/; | |
| 1551 $chg_ot =~ s/bam$/txt/; | |
| 1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | |
| 1553 $chg_ot = $output_dir . $chg_ot; | |
| 1554 | |
| 1555 if ($gzip){ | |
| 1556 $chg_ot .= '.gz'; | |
| 1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); | |
| 1558 } | |
| 1559 else{ | |
| 1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); | |
| 1561 } | |
| 1562 | |
| 1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); | |
| 1564 push @sorting_files,$chg_ot; | |
| 1565 | |
| 1566 unless($no_header){ | |
| 1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1568 } | |
| 1569 | |
| 1570 $chg_ctot =~ s/^/CHG_CTOT_/; | |
| 1571 $chg_ctot =~ s/sam$/txt/; | |
| 1572 $chg_ctot =~ s/bam$/txt/; | |
| 1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | |
| 1574 $chg_ctot = $output_dir . $chg_ctot; | |
| 1575 | |
| 1576 if ($gzip){ | |
| 1577 $chg_ctot .= '.gz'; | |
| 1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); | |
| 1579 } | |
| 1580 else{ | |
| 1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); | |
| 1582 } | |
| 1583 | |
| 1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); | |
| 1585 push @sorting_files,$chg_ctot; | |
| 1586 | |
| 1587 unless($no_header){ | |
| 1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1589 } | |
| 1590 | |
| 1591 $chg_ctob =~ s/^/CHG_CTOB_/; | |
| 1592 $chg_ctob =~ s/sam$/txt/; | |
| 1593 $chg_ctob =~ s/bam$/txt/; | |
| 1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | |
| 1595 $chg_ctob = $output_dir . $chg_ctob; | |
| 1596 | |
| 1597 if ($gzip){ | |
| 1598 $chg_ctob .= '.gz'; | |
| 1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); | |
| 1600 } | |
| 1601 else{ | |
| 1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); | |
| 1603 } | |
| 1604 | |
| 1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); | |
| 1606 push @sorting_files,$chg_ctob; | |
| 1607 | |
| 1608 unless($no_header){ | |
| 1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1610 } | |
| 1611 | |
| 1612 $chg_ob =~ s/^/CHG_OB_/; | |
| 1613 $chg_ob =~ s/sam$/txt/; | |
| 1614 $chg_ob =~ s/bam$/txt/; | |
| 1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | |
| 1616 $chg_ob = $output_dir . $chg_ob; | |
| 1617 | |
| 1618 if ($gzip){ | |
| 1619 $chg_ob .= '.gz'; | |
| 1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); | |
| 1621 } | |
| 1622 else{ | |
| 1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); | |
| 1624 } | |
| 1625 | |
| 1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); | |
| 1627 push @sorting_files,$chg_ob; | |
| 1628 | |
| 1629 unless($no_header){ | |
| 1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1631 } | |
| 1632 | |
| 1633 ### For cytosines in CHH context | |
| 1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | |
| 1635 | |
| 1636 $chh_ot =~ s/^/CHH_OT_/; | |
| 1637 $chh_ot =~ s/sam$/txt/; | |
| 1638 $chh_ot =~ s/bam$/txt/; | |
| 1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | |
| 1640 $chh_ot = $output_dir . $chh_ot; | |
| 1641 | |
| 1642 if ($gzip){ | |
| 1643 $chh_ot .= '.gz'; | |
| 1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); | |
| 1645 } | |
| 1646 else{ | |
| 1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); | |
| 1648 } | |
| 1649 | |
| 1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); | |
| 1651 push @sorting_files,$chh_ot; | |
| 1652 | |
| 1653 unless($no_header){ | |
| 1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1655 } | |
| 1656 | |
| 1657 $chh_ctot =~ s/^/CHH_CTOT_/; | |
| 1658 $chh_ctot =~ s/sam$/txt/; | |
| 1659 $chh_ctot =~ s/bam$/txt/; | |
| 1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | |
| 1661 $chh_ctot = $output_dir . $chh_ctot; | |
| 1662 | |
| 1663 if ($gzip){ | |
| 1664 $chh_ctot .= '.gz'; | |
| 1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); | |
| 1666 } | |
| 1667 else{ | |
| 1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); | |
| 1669 } | |
| 1670 | |
| 1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); | |
| 1672 push @sorting_files,$chh_ctot; | |
| 1673 | |
| 1674 unless($no_header){ | |
| 1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1676 } | |
| 1677 | |
| 1678 $chh_ctob =~ s/^/CHH_CTOB_/; | |
| 1679 $chh_ctob =~ s/sam$/txt/; | |
| 1680 $chh_ctob =~ s/bam$/txt/; | |
| 1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | |
| 1682 $chh_ctob = $output_dir . $chh_ctob; | |
| 1683 | |
| 1684 if ($gzip){ | |
| 1685 $chh_ctob .= '.gz'; | |
| 1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); | |
| 1687 } | |
| 1688 else{ | |
| 1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); | |
| 1690 } | |
| 1691 | |
| 1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); | |
| 1693 push @sorting_files,$chh_ctob; | |
| 1694 | |
| 1695 unless($no_header){ | |
| 1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1697 } | |
| 1698 | |
| 1699 $chh_ob =~ s/^/CHH_OB_/; | |
| 1700 $chh_ob =~ s/sam$/txt/; | |
| 1701 $chh_ob =~ s/bam$/txt/; | |
| 1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | |
| 1703 $chh_ob = $output_dir . $chh_ob; | |
| 1704 | |
| 1705 if ($gzip){ | |
| 1706 $chh_ob .= '.gz'; | |
| 1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); | |
| 1708 } | |
| 1709 else{ | |
| 1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); | |
| 1711 } | |
| 1712 | |
| 1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); | |
| 1714 push @sorting_files,$chh_ob; | |
| 1715 | |
| 1716 unless($no_header){ | |
| 1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
| 1718 } | |
| 1719 } | |
| 1720 | |
| 1721 my $methylation_call_strings_processed = 0; | |
| 1722 my $line_count = 0; | |
| 1723 | |
| 1724 ### proceeding differently now for single-end or paired-end Bismark files | |
| 1725 | |
| 1726 ### PROCESSING SINGLE-END RESULT FILES | |
| 1727 if ($single) { | |
| 1728 | |
| 1729 ### also proceeding differently now for SAM format or vanilla Bismark format files | |
| 1730 if ($vanilla) { # old vanilla Bismark output format | |
| 1731 while (<IN>) { | |
| 1732 ++$line_count; | |
| 1733 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
| 1734 | |
| 1735 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
| 1736 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; | |
| 1737 | |
| 1738 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or | |
| 1739 ### last position | |
| 1740 chomp $genome_conversion; | |
| 1741 | |
| 1742 my $index; | |
| 1743 if ($meth_call) { | |
| 1744 | |
| 1745 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
| 1746 $index = 0; | |
| 1747 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
| 1748 $index = 1; | |
| 1749 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
| 1750 $index = 3; | |
| 1751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
| 1752 $index = 2; | |
| 1753 } else { | |
| 1754 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
| 1755 } | |
| 1756 | |
| 1757 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
| 1758 if ($ignore) { | |
| 1759 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
| 1760 | |
| 1761 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
| 1762 if ($strand eq '+') { | |
| 1763 $start += $ignore; | |
| 1764 } elsif ($strand eq '-') { | |
| 1765 $start += length($meth_call)-1; ## $meth_call is already shortened! | |
| 1766 } else { | |
| 1767 die "Alignment did not have proper strand information: $strand\n"; | |
| 1768 } | |
| 1769 } | |
| 1770 ### printing out the methylation state of every C in the read | |
| 1771 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
| 1772 | |
| 1773 ++$methylation_call_strings_processed; # 1 per single-end result | |
| 1774 } | |
| 1775 } | |
| 1776 } else { # processing single-end SAM format (default) | |
| 1777 while (<IN>) { | |
| 1778 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
| 1779 if (/^\@/) { # skipping header lines (starting with @) | |
| 1780 warn "skipping SAM header line:\t$_"; | |
| 1781 next; | |
| 1782 } | |
| 1783 | |
| 1784 ++$line_count; | |
| 1785 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
| 1786 | |
| 1787 # example read in SAM format | |
| 1788 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
| 1789 ### | |
| 1790 | |
| 1791 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
| 1792 # < 0.7.6 $meth_call =~ s/^XM:Z://; | |
| 1793 # < 0.7.6 $read_conversion =~ s/^XR:Z://; | |
| 1794 # < 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
| 1795 | |
| 1796 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; | |
| 1797 | |
| 1798 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
| 1799 my $meth_call; ### Thanks to Zachary Zeno for this solution | |
| 1800 my $read_conversion; | |
| 1801 my $genome_conversion; | |
| 1802 | |
| 1803 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
| 1804 my $tag = $1; | |
| 1805 my $value = $2; | |
| 1806 | |
| 1807 if ($tag eq "XM") { | |
| 1808 $meth_call = $value; | |
| 1809 $meth_call =~ s/\r//; | |
| 1810 } elsif ($tag eq "XR") { | |
| 1811 $read_conversion = $value; | |
| 1812 $read_conversion =~ s/\r//; | |
| 1813 } elsif ($tag eq "XG") { | |
| 1814 $genome_conversion = $value; | |
| 1815 $genome_conversion =~ s/\r//; | |
| 1816 } | |
| 1817 } | |
| 1818 | |
| 1819 my $strand; | |
| 1820 chomp $genome_conversion; | |
| 1821 # print "$meth_call\n$read_conversion\n$genome_conversion\n"; | |
| 1822 | |
| 1823 my $index; | |
| 1824 if ($meth_call) { | |
| 1825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
| 1826 $index = 0; | |
| 1827 $strand = '+'; | |
| 1828 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
| 1829 $index = 1; | |
| 1830 $strand = '-'; | |
| 1831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
| 1832 $index = 2; | |
| 1833 $strand = '+'; | |
| 1834 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
| 1835 $index = 3; | |
| 1836 $strand = '-'; | |
| 1837 } else { | |
| 1838 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
| 1839 } | |
| 1840 | |
| 1841 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
| 1842 if ($strand eq '-') { | |
| 1843 $meth_call = reverse $meth_call; | |
| 1844 } | |
| 1845 | |
| 1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
| 1847 if ($ignore) { | |
| 1848 # print "\n\n$meth_call\n"; | |
| 1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
| 1850 # print "$meth_call\n"; | |
| 1851 | |
| 1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
| 1853 | |
| 1854 my @len = split (/\D+/,$cigar); # storing the length per operation | |
| 1855 my @ops = split (/\d+/,$cigar); # storing the operation | |
| 1856 shift @ops; # remove the empty first element | |
| 1857 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
| 1858 | |
| 1859 my @comp_cigar; # building an array with all CIGAR operations | |
| 1860 foreach my $index (0..$#len) { | |
| 1861 foreach (1..$len[$index]) { | |
| 1862 # print "$ops[$index]"; | |
| 1863 push @comp_cigar, $ops[$index]; | |
| 1864 } | |
| 1865 } | |
| 1866 # print "original CIGAR: $cigar\n"; | |
| 1867 # print "original CIGAR: @comp_cigar\n"; | |
| 1868 | |
| 1869 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
| 1870 if ($strand eq '+') { | |
| 1871 | |
| 1872 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
| 1873 my $I_count = 0; | |
| 1874 | |
| 1875 for (1..$ignore) { | |
| 1876 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start | |
| 1877 # print "$_ deleted $op\n"; | |
| 1878 | |
| 1879 while ($op eq 'D') { # repeating this for deletions (D) | |
| 1880 $D_count++; | |
| 1881 $op = shift @comp_cigar; | |
| 1882 # print "$_ deleted $op\n"; | |
| 1883 } | |
| 1884 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
| 1885 $I_count++; | |
| 1886 } | |
| 1887 } | |
| 1888 $start += $ignore + $D_count - $I_count; | |
| 1889 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; | |
| 1890 } elsif ($strand eq '-') { | |
| 1891 | |
| 1892 for (1..$ignore) { | |
| 1893 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
| 1894 while ($op eq 'D') { # repeating this for deletions (D) | |
| 1895 $op = pop @comp_cigar; | |
| 1896 } | |
| 1897 } | |
| 1898 | |
| 1899 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR | |
| 1900 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence | |
| 1901 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position | |
| 1902 foreach (@comp_cigar) { | |
| 1903 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); | |
| 1904 } | |
| 1905 $start += $MD_count - 1; | |
| 1906 } | |
| 1907 | |
| 1908 ### reconstituting shortened CIGAR string | |
| 1909 my $new_cigar; | |
| 1910 my $count = 0; | |
| 1911 my $last_op; | |
| 1912 # print "ignore adjusted: @comp_cigar\n"; | |
| 1913 foreach my $op (@comp_cigar) { | |
| 1914 unless (defined $last_op){ | |
| 1915 $last_op = $op; | |
| 1916 ++$count; | |
| 1917 next; | |
| 1918 } | |
| 1919 if ($last_op eq $op) { | |
| 1920 ++$count; | |
| 1921 } else { | |
| 1922 $new_cigar .= "$count$last_op"; | |
| 1923 $last_op = $op; | |
| 1924 $count = 1; | |
| 1925 } | |
| 1926 } | |
| 1927 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
| 1928 $cigar = $new_cigar; | |
| 1929 # print "ignore adjusted scalar: $cigar\n"; | |
| 1930 } | |
| 1931 } | |
| 1932 ### printing out the methylation state of every C in the read | |
| 1933 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); | |
| 1934 | |
| 1935 ++$methylation_call_strings_processed; # 1 per single-end result | |
| 1936 } | |
| 1937 } | |
| 1938 } | |
| 1939 | |
| 1940 ### PROCESSING PAIRED-END RESULT FILES | |
| 1941 elsif ($paired) { | |
| 1942 | |
| 1943 ### proceeding differently now for SAM format or vanilla Bismark format files | |
| 1944 if ($vanilla) { # old vanilla Bismark paired-end output format | |
| 1945 while (<IN>) { | |
| 1946 ++$line_count; | |
| 1947 warn "processed line: $line_count\n" if ($line_count%500000==0); | |
| 1948 | |
| 1949 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
| 1950 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; | |
| 1951 | |
| 1952 my $index; | |
| 1953 chomp $genome_conversion; | |
| 1954 | |
| 1955 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
| 1956 $index = 0; ## this is OT | |
| 1957 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
| 1958 $index = 2; ## this is CTOB!!! | |
| 1959 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
| 1960 $index = 1; ## this is CTOT!!! | |
| 1961 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
| 1962 $index = 3; ## this is OB | |
| 1963 } else { | |
| 1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
| 1965 } | |
| 1966 | |
| 1967 if ($meth_call_1 and $meth_call_2) { | |
| 1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
| 1969 | |
| 1970 if ($ignore) { | |
| 1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
| 1972 | |
| 1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | |
| 1974 $start_read_1 += $ignore; | |
| 1975 } | |
| 1976 if ($ignore_r2) { | |
| 1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
| 1978 | |
| 1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified | |
| 1980 $end_read_2 -= $ignore_r2; | |
| 1981 } | |
| 1982 | |
| 1983 my $end_read_1; | |
| 1984 my $start_read_2; | |
| 1985 | |
| 1986 if ($strand eq '+') { | |
| 1987 | |
| 1988 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
| 1989 $start_read_2 = $end_read_2-length($meth_call_2)+1; | |
| 1990 | |
| 1991 ## we first pass the first read which is in + orientation on the forward strand | |
| 1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity | |
| 1993 | |
| 1994 # we next pass the second read which is in - orientation on the reverse strand | |
| 1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
| 1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); | |
| 1997 } | |
| 1998 else { | |
| 1999 | |
| 2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | |
| 2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | |
| 2002 | |
| 2003 ## we first pass the first read which is in - orientation on the reverse strand | |
| 2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); | |
| 2005 | |
| 2006 # we next pass the second read which is in + orientation on the forward strand | |
| 2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
| 2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); | |
| 2009 } | |
| 2010 | |
| 2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
| 2012 } | |
| 2013 } | |
| 2014 } | |
| 2015 else { # Bismark paired-end SAM output format (default) | |
| 2016 while (<IN>) { | |
| 2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
| 2018 if (/^\@/) { # skipping header lines (starting with @) | |
| 2019 warn "skipping SAM header line:\t$_"; | |
| 2020 next; | |
| 2021 } | |
| 2022 | |
| 2023 ++$line_count; | |
| 2024 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
| 2025 | |
| 2026 # example paired-end reads in SAM format (2 consecutive lines) | |
| 2027 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
| 2028 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
| 2029 | |
| 2030 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
| 2031 | |
| 2032 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
| 2033 my $meth_call_1; | |
| 2034 my $first_read_conversion; | |
| 2035 my $genome_conversion; | |
| 2036 | |
| 2037 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
| 2038 my $tag = $1; | |
| 2039 my $value = $2; | |
| 2040 | |
| 2041 if ($tag eq "XM") { | |
| 2042 $meth_call_1 = $value; | |
| 2043 $meth_call_1 =~ s/\r//; | |
| 2044 } elsif ($tag eq "XR") { | |
| 2045 $first_read_conversion = $value; | |
| 2046 $first_read_conversion =~ s/\r//; | |
| 2047 } elsif ($tag eq "XG") { | |
| 2048 $genome_conversion = $value; | |
| 2049 $genome_conversion =~ s/\r//; | |
| 2050 } | |
| 2051 } | |
| 2052 | |
| 2053 $_ = <IN>; # reading in the paired read | |
| 2054 | |
| 2055 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
| 2056 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; | |
| 2057 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; | |
| 2058 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; | |
| 2059 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; | |
| 2060 | |
| 2061 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
| 2062 | |
| 2063 my $meth_call_2; | |
| 2064 my $second_read_conversion; | |
| 2065 | |
| 2066 while ( /(XM|XR):Z:([^\t]+)/g ) { | |
| 2067 my $tag = $1; | |
| 2068 my $value = $2; | |
| 2069 | |
| 2070 if ($tag eq "XM") { | |
| 2071 $meth_call_2 = $value; | |
| 2072 $meth_call_2 =~ s/\r//; | |
| 2073 } elsif ($tag eq "XR") { | |
| 2074 $second_read_conversion = $value; | |
| 2075 $second_read_conversion = s/\r//; | |
| 2076 } | |
| 2077 } | |
| 2078 | |
| 2079 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
| 2080 chomp $genome_conversion; # in case it captured a new line character | |
| 2081 | |
| 2082 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; | |
| 2083 | |
| 2084 my $index; | |
| 2085 my $strand; | |
| 2086 | |
| 2087 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
| 2088 $index = 0; ## this is OT | |
| 2089 $strand = '+'; | |
| 2090 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
| 2091 $index = 1; ## this is CTOT | |
| 2092 $strand = '-'; | |
| 2093 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
| 2094 $index = 2; ## this is CTOB | |
| 2095 $strand = '+'; | |
| 2096 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
| 2097 $index = 3; ## this is OB | |
| 2098 $strand = '-'; | |
| 2099 } else { | |
| 2100 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
| 2101 } | |
| 2102 | |
| 2103 ### reversing the methylation call of the read that was reverse-complemented | |
| 2104 if ($strand eq '+') { | |
| 2105 $meth_call_2 = reverse $meth_call_2; | |
| 2106 } else { | |
| 2107 $meth_call_1 = reverse $meth_call_1; | |
| 2108 } | |
| 2109 | |
| 2110 if ($meth_call_1 and $meth_call_2) { | |
| 2111 | |
| 2112 my $end_read_1; | |
| 2113 | |
| 2114 ### READ 1 | |
| 2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
| 2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
| 2117 shift @ops_1; # remove the empty first element | |
| 2118 | |
| 2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); | |
| 2120 | |
| 2121 my @comp_cigar_1; # building an array with all CIGAR operations | |
| 2122 foreach my $index (0..$#len_1) { | |
| 2123 foreach (1..$len_1[$index]) { | |
| 2124 # print "$ops_1[$index]"; | |
| 2125 push @comp_cigar_1, $ops_1[$index]; | |
| 2126 } | |
| 2127 } | |
| 2128 # print "original CIGAR read 1: $cigar_1\n"; | |
| 2129 # print "original CIGAR read 1: @comp_cigar_1\n"; | |
| 2130 | |
| 2131 ### READ 2 | |
| 2132 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
| 2133 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
| 2134 shift @ops_2; # remove the empty first element | |
| 2135 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
| 2136 my @comp_cigar_2; # building an array with all CIGAR operations for read 2 | |
| 2137 foreach my $index (0..$#len_2) { | |
| 2138 foreach (1..$len_2[$index]) { | |
| 2139 # print "$ops_2[$index]"; | |
| 2140 push @comp_cigar_2, $ops_2[$index]; | |
| 2141 } | |
| 2142 } | |
| 2143 # print "original CIGAR read 2: $cigar_2\n"; | |
| 2144 # print "original CIGAR read 2: @comp_cigar_2\n"; | |
| 2145 | |
| 2146 | |
| 2147 | |
| 2148 if ($ignore) { | |
| 2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1 | |
| 2150 ### the methylation calls have already been reversed where necessary | |
| 2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
| 2152 | |
| 2153 if ($strand eq '+') { | |
| 2154 | |
| 2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | |
| 2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
| 2157 my $I_count_1 = 0; | |
| 2158 | |
| 2159 for (1..$ignore) { | |
| 2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | |
| 2161 # print "$_ deleted $op\n"; | |
| 2162 | |
| 2163 while ($op eq 'D') { # repeating this for deletions (D) | |
| 2164 $D_count_1++; | |
| 2165 $op = shift @comp_cigar_1; | |
| 2166 # print "$_ deleted $op\n"; | |
| 2167 } | |
| 2168 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
| 2169 $I_count_1++; | |
| 2170 } | |
| 2171 } | |
| 2172 | |
| 2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | |
| 2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | |
| 2175 | |
| 2176 # the start position of reads mapping to the reverse strand is being adjusted further below | |
| 2177 } | |
| 2178 elsif ($strand eq '-') { | |
| 2179 | |
| 2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | |
| 2181 for (1..$ignore) { | |
| 2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
| 2183 while ($op eq 'D') { # repeating this for deletions (D) | |
| 2184 $op = pop @comp_cigar_1; | |
| 2185 } | |
| 2186 } | |
| 2187 # the start position of reads mapping to the reverse strand is being adjusted further below | |
| 2188 | |
| 2189 } | |
| 2190 } | |
| 2191 | |
| 2192 if ($ignore_r2) { | |
| 2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2 | |
| 2194 ### the methylation calls have already been reversed where necessary | |
| 2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
| 2196 | |
| 2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
| 2198 | |
| 2199 if ($strand eq '+') { | |
| 2200 | |
| 2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
| 2202 | |
| 2203 for (1..$ignore_r2) { | |
| 2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
| 2205 while ($op eq 'D') { # repeating this for deletions (D) | |
| 2206 $op = pop @comp_cigar_2; | |
| 2207 } | |
| 2208 } | |
| 2209 # the start position of reads mapping to the reverse strand is being adjusted further below | |
| 2210 } | |
| 2211 elsif ($strand eq '-') { | |
| 2212 | |
| 2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | |
| 2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
| 2215 my $I_count_2 = 0; | |
| 2216 | |
| 2217 for (1..$ignore_r2) { | |
| 2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
| 2219 # print "$_ deleted $op\n"; | |
| 2220 | |
| 2221 while ($op eq 'D') { # repeating this for deletions (D) | |
| 2222 $D_count_2++; | |
| 2223 $op = shift @comp_cigar_2; | |
| 2224 # print "$_ deleted $op\n"; | |
| 2225 } | |
| 2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
| 2227 $I_count_2++; | |
| 2228 } | |
| 2229 } | |
| 2230 | |
| 2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; | |
| 2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
| 2233 } | |
| 2234 } | |
| 2235 | |
| 2236 if ($ignore){ | |
| 2237 ### reconstituting shortened CIGAR string 1 | |
| 2238 my $new_cigar_1; | |
| 2239 my $count_1 = 0; | |
| 2240 my $last_op_1; | |
| 2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | |
| 2242 foreach my $op (@comp_cigar_1) { | |
| 2243 unless (defined $last_op_1){ | |
| 2244 $last_op_1 = $op; | |
| 2245 ++$count_1; | |
| 2246 next; | |
| 2247 } | |
| 2248 if ($last_op_1 eq $op) { | |
| 2249 ++$count_1; | |
| 2250 } else { | |
| 2251 $new_cigar_1 .= "$count_1$last_op_1"; | |
| 2252 $last_op_1 = $op; | |
| 2253 $count_1 = 1; | |
| 2254 } | |
| 2255 } | |
| 2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
| 2257 $cigar_1 = $new_cigar_1; | |
| 2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | |
| 2259 } | |
| 2260 | |
| 2261 if ($ignore_r2){ | |
| 2262 | |
| 2263 ### reconstituting shortened CIGAR string 2 | |
| 2264 my $new_cigar_2; | |
| 2265 my $count_2 = 0; | |
| 2266 my $last_op_2; | |
| 2267 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; | |
| 2268 foreach my $op (@comp_cigar_2) { | |
| 2269 unless (defined $last_op_2){ | |
| 2270 $last_op_2 = $op; | |
| 2271 ++$count_2; | |
| 2272 next; | |
| 2273 } | |
| 2274 if ($last_op_2 eq $op) { | |
| 2275 ++$count_2; | |
| 2276 } | |
| 2277 else { | |
| 2278 $new_cigar_2 .= "$count_2$last_op_2"; | |
| 2279 $last_op_2 = $op; | |
| 2280 $count_2 = 1; | |
| 2281 } | |
| 2282 } | |
| 2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
| 2284 $cigar_2 = $new_cigar_2; | |
| 2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; | |
| 2286 } | |
| 2287 | |
| 2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on | |
| 2289 | |
| 2290 if ($strand eq '+') { | |
| 2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | |
| 2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
| 2293 # print "reverse: @comp_cigar_2\n"; | |
| 2294 | |
| 2295 my $MD_count_1 = 0; | |
| 2296 foreach (@comp_cigar_1) { | |
| 2297 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
| 2298 } | |
| 2299 | |
| 2300 my $MD_count_2 = 0; | |
| 2301 foreach (@comp_cigar_2) { | |
| 2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
| 2303 } | |
| 2304 | |
| 2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | |
| 2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | |
| 2307 } | |
| 2308 else { | |
| 2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | |
| 2310 | |
| 2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
| 2312 # print "reverse: @comp_cigar_1\n"; | |
| 2313 | |
| 2314 my $MD_count_1 = 0; | |
| 2315 foreach (@comp_cigar_1) { | |
| 2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
| 2317 } | |
| 2318 | |
| 2319 $end_read_1 = $start_read_1; | |
| 2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | |
| 2321 } | |
| 2322 | |
| 2323 if ($strand eq '+') { | |
| 2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity | |
| 2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); | |
| 2326 | |
| 2327 # we next pass the second read which is in - orientation on the reverse strand | |
| 2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
| 2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); | |
| 2330 } else { | |
| 2331 ## we first pass the first read which is in - orientation on the reverse strand | |
| 2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); | |
| 2333 | |
| 2334 # we next pass the second read which is in + orientation on the forward strand | |
| 2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
| 2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); | |
| 2337 } | |
| 2338 | |
| 2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
| 2340 } | |
| 2341 } | |
| 2342 } | |
| 2343 } else { | |
| 2344 die "Single-end or paired-end reads not specified properly\n"; | |
| 2345 } | |
| 2346 | |
| 2347 warn "\n\nProcessed $line_count lines from $filename in total\n"; | |
| 2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
| 2349 if ($report) { | |
| 2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n"; | |
| 2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
| 2352 } | |
| 2353 print_splitting_report (); | |
| 2354 } | |
| 2355 | |
| 2356 | |
| 2357 | |
| 2358 sub print_splitting_report{ | |
| 2359 | |
| 2360 ### Calculating methylation percentages if applicable | |
| 2361 | |
| 2362 my $percent_meCpG; | |
| 2363 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
| 2364 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
| 2365 } | |
| 2366 | |
| 2367 my $percent_meCHG; | |
| 2368 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
| 2369 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
| 2370 } | |
| 2371 | |
| 2372 my $percent_meCHH; | |
| 2373 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
| 2374 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
| 2375 } | |
| 2376 | |
| 2377 my $percent_non_CpG_methylation; | |
| 2378 if ($merge_non_CpG){ | |
| 2379 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
| 2380 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
| 2381 } | |
| 2382 } | |
| 2383 | |
| 2384 if ($report){ | |
| 2385 ### detailed information about Cs analysed | |
| 2386 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
| 2387 | |
| 2388 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
| 2389 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
| 2390 | |
| 2391 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
| 2392 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
| 2393 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
| 2394 | |
| 2395 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
| 2396 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
| 2397 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
| 2398 | |
| 2399 ### calculating methylated CpG percentage if applicable | |
| 2400 if ($percent_meCpG){ | |
| 2401 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
| 2402 } | |
| 2403 else{ | |
| 2404 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
| 2405 } | |
| 2406 | |
| 2407 ### 2-Context Output | |
| 2408 if ($merge_non_CpG){ | |
| 2409 if ($percent_non_CpG_methylation){ | |
| 2410 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
| 2411 } | |
| 2412 else{ | |
| 2413 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
| 2414 } | |
| 2415 } | |
| 2416 | |
| 2417 ### 3 Context Output | |
| 2418 else{ | |
| 2419 ### calculating methylated CHG percentage if applicable | |
| 2420 if ($percent_meCHG){ | |
| 2421 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
| 2422 } | |
| 2423 else{ | |
| 2424 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
| 2425 } | |
| 2426 | |
| 2427 ### calculating methylated CHH percentage if applicable | |
| 2428 if ($percent_meCHH){ | |
| 2429 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
| 2430 } | |
| 2431 else{ | |
| 2432 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
| 2433 } | |
| 2434 } | |
| 2435 } | |
| 2436 | |
| 2437 ### detailed information about Cs analysed for on-screen report | |
| 2438 print "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
| 2439 | |
| 2440 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
| 2441 print "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
| 2442 | |
| 2443 print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
| 2444 print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
| 2445 print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
| 2446 | |
| 2447 print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
| 2448 print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
| 2449 print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
| 2450 | |
| 2451 ### printing methylated CpG percentage if applicable | |
| 2452 if ($percent_meCpG){ | |
| 2453 print "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
| 2454 } | |
| 2455 else{ | |
| 2456 print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
| 2457 } | |
| 2458 | |
| 2459 ### 2-Context Output | |
| 2460 if ($merge_non_CpG){ | |
| 2461 if ($percent_non_CpG_methylation){ | |
| 2462 print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
| 2463 } | |
| 2464 else{ | |
| 2465 print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
| 2466 } | |
| 2467 } | |
| 2468 | |
| 2469 ### 3-Context Output | |
| 2470 else{ | |
| 2471 ### printing methylated CHG percentage if applicable | |
| 2472 if ($percent_meCHG){ | |
| 2473 print "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
| 2474 } | |
| 2475 else{ | |
| 2476 print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
| 2477 } | |
| 2478 | |
| 2479 ### printing methylated CHH percentage if applicable | |
| 2480 if ($percent_meCHH){ | |
| 2481 print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
| 2482 } | |
| 2483 else{ | |
| 2484 print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
| 2485 } | |
| 2486 } | |
| 2487 } | |
| 2488 | |
| 2489 | |
| 2490 | |
| 2491 sub print_individual_C_methylation_states_paired_end_files{ | |
| 2492 | |
| 2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; | |
| 2494 | |
| 2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 | |
| 2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); | |
| 2497 | |
| 2498 my @methylation_calls = split(//,$meth_call); | |
| 2499 | |
| 2500 ################################################################# | |
| 2501 ### . for bases not involving cytosines ### | |
| 2502 ### X for methylated C in CHG context (was protected) ### | |
| 2503 ### x for not methylated C in CHG context (was converted) ### | |
| 2504 ### H for methylated C in CHH context (was protected) ### | |
| 2505 ### h for not methylated C in CHH context (was converted) ### | |
| 2506 ### Z for methylated C in CpG context (was protected) ### | |
| 2507 ### z for not methylated C in CpG context (was converted) ### | |
| 2508 ### U for methylated C in Unknown context (was protected) ### | |
| 2509 ### u for not methylated C in Unknown context (was converted) ### | |
| 2510 ################################################################# | |
| 2511 | |
| 2512 my $methyl_CHG_count = 0; | |
| 2513 my $methyl_CHH_count = 0; | |
| 2514 my $methyl_CpG_count = 0; | |
| 2515 my $unmethylated_CHG_count = 0; | |
| 2516 my $unmethylated_CHH_count = 0; | |
| 2517 my $unmethylated_CpG_count = 0; | |
| 2518 | |
| 2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
| 2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
| 2521 my @comp_cigar; | |
| 2522 | |
| 2523 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
| 2524 if ($cigar =~ /^\d+M$/){ | |
| 2525 # this check speeds up the extraction process by up to 60%!!! | |
| 2526 } | |
| 2527 else{ # parsing CIGAR string | |
| 2528 my @len; | |
| 2529 my @ops; | |
| 2530 @len = split (/\D+/,$cigar); # storing the length per operation | |
| 2531 @ops = split (/\d+/,$cigar); # storing the operation | |
| 2532 shift @ops; # remove the empty first element | |
| 2533 | |
| 2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
| 2535 | |
| 2536 foreach my $index (0..$#len){ | |
| 2537 foreach (1..$len[$index]){ | |
| 2538 # print "$ops[$index]"; | |
| 2539 push @comp_cigar, $ops[$index]; | |
| 2540 } | |
| 2541 } | |
| 2542 # warn "\nDetected CIGAR string: $cigar\n"; | |
| 2543 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
| 2544 # warn "number of operations: ",scalar @ops,"\n"; | |
| 2545 # warn "number of length digits: ",scalar @len,"\n\n"; | |
| 2546 # print @comp_cigar,"\n"; | |
| 2547 # print "$meth_call\n\n"; | |
| 2548 # sleep (1); | |
| 2549 } | |
| 2550 | |
| 2551 if ($strand eq '-') { | |
| 2552 | |
| 2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
| 2554 if (@comp_cigar){ | |
| 2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
| 2556 } | |
| 2557 # print "reverse CIGAR string: @comp_cigar\n"; | |
| 2558 | |
| 2559 ### the start position of paired-end files has already been corrected, see above | |
| 2560 } | |
| 2561 | |
| 2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
| 2563 | |
| 2564 if ($merge_non_CpG) { | |
| 2565 if ($no_overlap) { # this has to be read 2... | |
| 2566 | |
| 2567 ### single-file CpG and non-CpG context output | |
| 2568 if ($full) { | |
| 2569 if ($strand eq '+') { | |
| 2570 for my $index (0..$#methylation_calls) { | |
| 2571 | |
| 2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 2575 $cigar_offset += $cigar_mod; | |
| 2576 $pos_offset += $pos_mod; | |
| 2577 } | |
| 2578 | |
| 2579 ### Returning as soon as the methylation calls start overlapping | |
| 2580 if ($start+$index+$pos_offset >= $end_read_1) { | |
| 2581 return; | |
| 2582 } | |
| 2583 | |
| 2584 if ($methylation_calls[$index] eq 'X') { | |
| 2585 $counting{total_meCHG_count}++; | |
| 2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2587 if ($read_identity == 1){ | |
| 2588 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 2589 } | |
| 2590 else{ | |
| 2591 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 2592 } | |
| 2593 } | |
| 2594 elsif ($methylation_calls[$index] eq 'x') { | |
| 2595 $counting{total_unmethylated_CHG_count}++; | |
| 2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2597 if ($read_identity == 1){ | |
| 2598 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 2599 } | |
| 2600 else{ | |
| 2601 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 2602 } | |
| 2603 } | |
| 2604 elsif ($methylation_calls[$index] eq 'Z') { | |
| 2605 $counting{total_meCpG_count}++; | |
| 2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2607 if ($read_identity == 1){ | |
| 2608 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 2609 } | |
| 2610 else{ | |
| 2611 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 2612 } | |
| 2613 } | |
| 2614 elsif ($methylation_calls[$index] eq 'z') { | |
| 2615 $counting{total_unmethylated_CpG_count}++; | |
| 2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2617 if ($read_identity == 1){ | |
| 2618 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 2619 } | |
| 2620 else{ | |
| 2621 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 2622 } | |
| 2623 } | |
| 2624 elsif ($methylation_calls[$index] eq 'H') { | |
| 2625 $counting{total_meCHH_count}++; | |
| 2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2627 if ($read_identity == 1){ | |
| 2628 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 2629 } | |
| 2630 else{ | |
| 2631 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 2632 } | |
| 2633 } | |
| 2634 elsif ($methylation_calls[$index] eq 'h') { | |
| 2635 $counting{total_unmethylated_CHH_count}++; | |
| 2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2637 if ($read_identity == 1){ | |
| 2638 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 2639 } | |
| 2640 else{ | |
| 2641 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 2642 } | |
| 2643 } | |
| 2644 elsif ($methylation_calls[$index] eq '.'){} | |
| 2645 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 2646 else{ | |
| 2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
| 2648 } | |
| 2649 } | |
| 2650 } | |
| 2651 elsif ($strand eq '-') { | |
| 2652 for my $index (0..$#methylation_calls) { | |
| 2653 | |
| 2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2657 $cigar_offset += $cigar_mod; | |
| 2658 $pos_offset += $pos_mod; | |
| 2659 } | |
| 2660 | |
| 2661 ### Returning as soon as the methylation calls start overlapping | |
| 2662 if ($start-$index+$pos_offset <= $end_read_1) { | |
| 2663 return; | |
| 2664 } | |
| 2665 | |
| 2666 if ($methylation_calls[$index] eq 'X') { | |
| 2667 $counting{total_meCHG_count}++; | |
| 2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2669 if ($read_identity == 1){ | |
| 2670 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 2671 } | |
| 2672 else{ | |
| 2673 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 2674 } | |
| 2675 } | |
| 2676 elsif ($methylation_calls[$index] eq 'x') { | |
| 2677 $counting{total_unmethylated_CHG_count}++; | |
| 2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2679 if ($read_identity == 1){ | |
| 2680 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 2681 } | |
| 2682 else{ | |
| 2683 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 2684 } | |
| 2685 } | |
| 2686 elsif ($methylation_calls[$index] eq 'Z') { | |
| 2687 $counting{total_meCpG_count}++; | |
| 2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2689 if ($read_identity == 1){ | |
| 2690 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 2691 } | |
| 2692 else{ | |
| 2693 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 2694 } | |
| 2695 } | |
| 2696 elsif ($methylation_calls[$index] eq 'z') { | |
| 2697 $counting{total_unmethylated_CpG_count}++; | |
| 2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2699 if ($read_identity == 1){ | |
| 2700 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 2701 } | |
| 2702 else{ | |
| 2703 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 2704 } | |
| 2705 } | |
| 2706 elsif ($methylation_calls[$index] eq 'H') { | |
| 2707 $counting{total_meCHH_count}++; | |
| 2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2709 if ($read_identity == 1){ | |
| 2710 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 2711 } | |
| 2712 else{ | |
| 2713 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 2714 } | |
| 2715 } | |
| 2716 elsif ($methylation_calls[$index] eq 'h') { | |
| 2717 $counting{total_unmethylated_CHH_count}++; | |
| 2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2719 if ($read_identity == 1){ | |
| 2720 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 2721 } | |
| 2722 else{ | |
| 2723 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 2724 } | |
| 2725 } | |
| 2726 elsif ($methylation_calls[$index] eq '.') {} | |
| 2727 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 2728 else{ | |
| 2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
| 2730 } | |
| 2731 } | |
| 2732 } else { | |
| 2733 die "The read orientation was neither + nor -: '$strand'\n"; | |
| 2734 } | |
| 2735 } | |
| 2736 | |
| 2737 ### strand-specific methylation output | |
| 2738 else { | |
| 2739 if ($strand eq '+') { | |
| 2740 for my $index (0..$#methylation_calls) { | |
| 2741 | |
| 2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 2745 $cigar_offset += $cigar_mod; | |
| 2746 $pos_offset += $pos_mod; | |
| 2747 } | |
| 2748 | |
| 2749 ### Returning as soon as the methylation calls start overlapping | |
| 2750 if ($start+$index+$pos_offset >= $end_read_1) { | |
| 2751 return; | |
| 2752 } | |
| 2753 | |
| 2754 if ($methylation_calls[$index] eq 'X') { | |
| 2755 $counting{total_meCHG_count}++; | |
| 2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2757 if ($read_identity == 1){ | |
| 2758 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 2759 } | |
| 2760 else{ | |
| 2761 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 2762 } | |
| 2763 } | |
| 2764 elsif ($methylation_calls[$index] eq 'x') { | |
| 2765 $counting{total_unmethylated_CHG_count}++; | |
| 2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2767 if ($read_identity == 1){ | |
| 2768 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 2769 } | |
| 2770 else{ | |
| 2771 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 2772 } | |
| 2773 } | |
| 2774 elsif ($methylation_calls[$index] eq 'Z') { | |
| 2775 $counting{total_meCpG_count}++; | |
| 2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2777 if ($read_identity == 1){ | |
| 2778 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 2779 } | |
| 2780 else{ | |
| 2781 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 2782 } | |
| 2783 } | |
| 2784 elsif ($methylation_calls[$index] eq 'z') { | |
| 2785 $counting{total_unmethylated_CpG_count}++; | |
| 2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2787 if ($read_identity == 1){ | |
| 2788 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 2789 } | |
| 2790 else{ | |
| 2791 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 2792 } | |
| 2793 } | |
| 2794 elsif ($methylation_calls[$index] eq 'H') { | |
| 2795 $counting{total_meCHH_count}++; | |
| 2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2797 if ($read_identity == 1){ | |
| 2798 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 2799 } | |
| 2800 else{ | |
| 2801 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 2802 } | |
| 2803 } | |
| 2804 elsif ($methylation_calls[$index] eq 'h') { | |
| 2805 $counting{total_unmethylated_CHH_count}++; | |
| 2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2807 if ($read_identity == 1){ | |
| 2808 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 2809 } | |
| 2810 else{ | |
| 2811 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 2812 } | |
| 2813 } | |
| 2814 elsif ($methylation_calls[$index] eq '.') {} | |
| 2815 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 2816 else{ | |
| 2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 2818 } | |
| 2819 } | |
| 2820 } elsif ($strand eq '-') { | |
| 2821 for my $index (0..$#methylation_calls) { | |
| 2822 | |
| 2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2826 $cigar_offset += $cigar_mod; | |
| 2827 $pos_offset += $pos_mod; | |
| 2828 } | |
| 2829 | |
| 2830 ### Returning as soon as the methylation calls start overlapping | |
| 2831 if ($start-$index+$pos_offset <= $end_read_1) { | |
| 2832 return; | |
| 2833 } | |
| 2834 | |
| 2835 if ($methylation_calls[$index] eq 'X') { | |
| 2836 $counting{total_meCHG_count}++; | |
| 2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2838 if ($read_identity == 1){ | |
| 2839 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 2840 } | |
| 2841 else{ | |
| 2842 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 2843 } | |
| 2844 } | |
| 2845 elsif ($methylation_calls[$index] eq 'x') { | |
| 2846 $counting{total_unmethylated_CHG_count}++; | |
| 2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2848 if ($read_identity == 1){ | |
| 2849 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 2850 } | |
| 2851 else{ | |
| 2852 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 2853 } | |
| 2854 } | |
| 2855 elsif ($methylation_calls[$index] eq 'Z') { | |
| 2856 $counting{total_meCpG_count}++; | |
| 2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2858 if ($read_identity == 1){ | |
| 2859 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 2860 } | |
| 2861 else{ | |
| 2862 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 2863 } | |
| 2864 } | |
| 2865 elsif ($methylation_calls[$index] eq 'z') { | |
| 2866 $counting{total_unmethylated_CpG_count}++; | |
| 2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2868 if ($read_identity == 1){ | |
| 2869 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 2870 } | |
| 2871 else{ | |
| 2872 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 2873 } | |
| 2874 } | |
| 2875 elsif ($methylation_calls[$index] eq 'H') { | |
| 2876 $counting{total_meCHH_count}++; | |
| 2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2878 if ($read_identity == 1){ | |
| 2879 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 2880 } | |
| 2881 else{ | |
| 2882 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 2883 } | |
| 2884 } | |
| 2885 elsif ($methylation_calls[$index] eq 'h') { | |
| 2886 $counting{total_unmethylated_CHH_count}++; | |
| 2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2888 if ($read_identity == 1){ | |
| 2889 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 2890 } | |
| 2891 else{ | |
| 2892 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 2893 } | |
| 2894 } | |
| 2895 elsif ($methylation_calls[$index] eq '.') {} | |
| 2896 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 2897 else{ | |
| 2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 2899 } | |
| 2900 } | |
| 2901 } else { | |
| 2902 die "The strand orientation was neither + nor -: '$strand'/n"; | |
| 2903 } | |
| 2904 } | |
| 2905 } | |
| 2906 | |
| 2907 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
| 2908 ### Still within the 2-CONTEXT ONLY optional section | |
| 2909 else { | |
| 2910 ### single-file CpG and non-CpG context output | |
| 2911 if ($full) { | |
| 2912 if ($strand eq '+') { | |
| 2913 for my $index (0..$#methylation_calls) { | |
| 2914 | |
| 2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 2918 $cigar_offset += $cigar_mod; | |
| 2919 $pos_offset += $pos_mod; | |
| 2920 } | |
| 2921 | |
| 2922 if ($methylation_calls[$index] eq 'X') { | |
| 2923 $counting{total_meCHG_count}++; | |
| 2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2925 if ($read_identity == 1){ | |
| 2926 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 2927 } | |
| 2928 else{ | |
| 2929 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 2930 } | |
| 2931 } | |
| 2932 elsif ($methylation_calls[$index] eq 'x') { | |
| 2933 $counting{total_unmethylated_CHG_count}++; | |
| 2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2935 if ($read_identity == 1){ | |
| 2936 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 2937 } | |
| 2938 else{ | |
| 2939 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 2940 } | |
| 2941 } | |
| 2942 elsif ($methylation_calls[$index] eq 'Z') { | |
| 2943 $counting{total_meCpG_count}++; | |
| 2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2945 if ($read_identity == 1){ | |
| 2946 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 2947 } | |
| 2948 else{ | |
| 2949 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 2950 } | |
| 2951 } | |
| 2952 elsif ($methylation_calls[$index] eq 'z') { | |
| 2953 $counting{total_unmethylated_CpG_count}++; | |
| 2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2955 if ($read_identity == 1){ | |
| 2956 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 2957 } | |
| 2958 else{ | |
| 2959 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 2960 } | |
| 2961 } | |
| 2962 elsif ($methylation_calls[$index] eq 'H') { | |
| 2963 $counting{total_meCHH_count}++; | |
| 2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2965 if ($read_identity == 1){ | |
| 2966 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 2967 } | |
| 2968 else{ | |
| 2969 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 2970 } | |
| 2971 } | |
| 2972 elsif ($methylation_calls[$index] eq 'h') { | |
| 2973 $counting{total_unmethylated_CHH_count}++; | |
| 2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 2975 if ($read_identity == 1){ | |
| 2976 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 2977 } | |
| 2978 else{ | |
| 2979 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 2980 } | |
| 2981 } | |
| 2982 elsif ($methylation_calls[$index] eq '.') {} | |
| 2983 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 2984 else{ | |
| 2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
| 2986 } | |
| 2987 } | |
| 2988 } elsif ($strand eq '-') { | |
| 2989 for my $index (0..$#methylation_calls) { | |
| 2990 | |
| 2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 2994 $cigar_offset += $cigar_mod; | |
| 2995 $pos_offset += $pos_mod; | |
| 2996 } | |
| 2997 | |
| 2998 if ($methylation_calls[$index] eq 'X') { | |
| 2999 $counting{total_meCHG_count}++; | |
| 3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3001 if ($read_identity == 1){ | |
| 3002 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3003 } | |
| 3004 else{ | |
| 3005 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3006 } | |
| 3007 } | |
| 3008 elsif ($methylation_calls[$index] eq 'x') { | |
| 3009 $counting{total_unmethylated_CHG_count}++; | |
| 3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3011 if ($read_identity == 1){ | |
| 3012 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3013 } | |
| 3014 else{ | |
| 3015 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3016 } | |
| 3017 } | |
| 3018 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3019 $counting{total_meCpG_count}++; | |
| 3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3021 if ($read_identity == 1){ | |
| 3022 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3023 } | |
| 3024 else{ | |
| 3025 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3026 } | |
| 3027 } | |
| 3028 elsif ($methylation_calls[$index] eq 'z') { | |
| 3029 $counting{total_unmethylated_CpG_count}++; | |
| 3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3031 if ($read_identity == 1){ | |
| 3032 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3033 } | |
| 3034 else{ | |
| 3035 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3036 } | |
| 3037 } | |
| 3038 elsif ($methylation_calls[$index] eq 'H') { | |
| 3039 $counting{total_meCHH_count}++; | |
| 3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3041 if ($read_identity == 1){ | |
| 3042 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3043 } | |
| 3044 else{ | |
| 3045 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3046 } | |
| 3047 } | |
| 3048 elsif ($methylation_calls[$index] eq 'h') { | |
| 3049 $counting{total_unmethylated_CHH_count}++; | |
| 3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3051 if ($read_identity == 1){ | |
| 3052 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3053 } | |
| 3054 else{ | |
| 3055 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3056 } | |
| 3057 } | |
| 3058 elsif ($methylation_calls[$index] eq '.') {} | |
| 3059 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3060 else{ | |
| 3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
| 3062 } | |
| 3063 } | |
| 3064 } else { | |
| 3065 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3066 } | |
| 3067 } | |
| 3068 | |
| 3069 ### strand-specific methylation output | |
| 3070 ### still within the 2-CONTEXT optional section | |
| 3071 else { | |
| 3072 if ($strand eq '+') { | |
| 3073 for my $index (0..$#methylation_calls) { | |
| 3074 | |
| 3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 3078 $cigar_offset += $cigar_mod; | |
| 3079 $pos_offset += $pos_mod; | |
| 3080 } | |
| 3081 | |
| 3082 if ($methylation_calls[$index] eq 'X') { | |
| 3083 $counting{total_meCHG_count}++; | |
| 3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3085 if ($read_identity == 1){ | |
| 3086 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3087 } | |
| 3088 else{ | |
| 3089 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3090 } | |
| 3091 } | |
| 3092 elsif ($methylation_calls[$index] eq 'x') { | |
| 3093 $counting{total_unmethylated_CHG_count}++; | |
| 3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3095 if ($read_identity == 1){ | |
| 3096 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3097 } | |
| 3098 else{ | |
| 3099 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3100 } | |
| 3101 } | |
| 3102 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3103 $counting{total_meCpG_count}++; | |
| 3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3105 if ($read_identity == 1){ | |
| 3106 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3107 } | |
| 3108 else{ | |
| 3109 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3110 } | |
| 3111 } | |
| 3112 elsif ($methylation_calls[$index] eq 'z') { | |
| 3113 $counting{total_unmethylated_CpG_count}++; | |
| 3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3115 if ($read_identity == 1){ | |
| 3116 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3117 } | |
| 3118 else{ | |
| 3119 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3120 } | |
| 3121 } | |
| 3122 elsif ($methylation_calls[$index] eq 'H') { | |
| 3123 $counting{total_meCHH_count}++; | |
| 3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3125 if ($read_identity == 1){ | |
| 3126 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3127 } | |
| 3128 else{ | |
| 3129 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3130 } | |
| 3131 } | |
| 3132 elsif ($methylation_calls[$index] eq 'h') { | |
| 3133 $counting{total_unmethylated_CHH_count}++; | |
| 3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3135 if ($read_identity == 1){ | |
| 3136 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3137 } | |
| 3138 else{ | |
| 3139 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3140 } | |
| 3141 } | |
| 3142 elsif ($methylation_calls[$index] eq '.') {} | |
| 3143 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3144 else{ | |
| 3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3146 } | |
| 3147 } | |
| 3148 } elsif ($strand eq '-') { | |
| 3149 for my $index (0..$#methylation_calls) { | |
| 3150 | |
| 3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3154 $cigar_offset += $cigar_mod; | |
| 3155 $pos_offset += $pos_mod; | |
| 3156 } | |
| 3157 | |
| 3158 if ($methylation_calls[$index] eq 'X') { | |
| 3159 $counting{total_meCHG_count}++; | |
| 3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3161 if ($read_identity == 1){ | |
| 3162 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3163 } | |
| 3164 else{ | |
| 3165 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3166 } | |
| 3167 } | |
| 3168 elsif ($methylation_calls[$index] eq 'x') { | |
| 3169 $counting{total_unmethylated_CHG_count}++; | |
| 3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3171 if ($read_identity == 1){ | |
| 3172 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3173 } | |
| 3174 else{ | |
| 3175 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3176 } | |
| 3177 } | |
| 3178 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3179 $counting{total_meCpG_count}++; | |
| 3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3181 if ($read_identity == 1){ | |
| 3182 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3183 } | |
| 3184 else{ | |
| 3185 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3186 } | |
| 3187 } | |
| 3188 elsif ($methylation_calls[$index] eq 'z') { | |
| 3189 $counting{total_unmethylated_CpG_count}++; | |
| 3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3191 if ($read_identity == 1){ | |
| 3192 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3193 } | |
| 3194 else{ | |
| 3195 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3196 } | |
| 3197 } | |
| 3198 elsif ($methylation_calls[$index] eq 'H') { | |
| 3199 $counting{total_meCHH_count}++; | |
| 3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3201 if ($read_identity == 1){ | |
| 3202 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3203 } | |
| 3204 else{ | |
| 3205 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3206 } | |
| 3207 } | |
| 3208 elsif ($methylation_calls[$index] eq 'h') { | |
| 3209 $counting{total_unmethylated_CHH_count}++; | |
| 3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3211 if ($read_identity == 1){ | |
| 3212 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3213 } | |
| 3214 else{ | |
| 3215 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3216 } | |
| 3217 } | |
| 3218 elsif ($methylation_calls[$index] eq '.') {} | |
| 3219 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3220 else{ | |
| 3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3222 } | |
| 3223 } | |
| 3224 } else { | |
| 3225 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3226 } | |
| 3227 } | |
| 3228 } | |
| 3229 } | |
| 3230 | |
| 3231 ############################################ | |
| 3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
| 3233 ############################################ | |
| 3234 | |
| 3235 elsif ($no_overlap) { | |
| 3236 ### single-file CpG, CHG and CHH context output | |
| 3237 if ($full) { | |
| 3238 if ($strand eq '+') { | |
| 3239 for my $index (0..$#methylation_calls) { | |
| 3240 | |
| 3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 3244 $cigar_offset += $cigar_mod; | |
| 3245 $pos_offset += $pos_mod; | |
| 3246 } | |
| 3247 | |
| 3248 ### Returning as soon as the methylation calls start overlapping | |
| 3249 if ($start+$index+$pos_offset >= $end_read_1) { | |
| 3250 return; | |
| 3251 } | |
| 3252 | |
| 3253 if ($methylation_calls[$index] eq 'X') { | |
| 3254 $counting{total_meCHG_count}++; | |
| 3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3256 if ($read_identity == 1){ | |
| 3257 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3258 } | |
| 3259 else{ | |
| 3260 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3261 } | |
| 3262 } | |
| 3263 elsif ($methylation_calls[$index] eq 'x') { | |
| 3264 $counting{total_unmethylated_CHG_count}++; | |
| 3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3266 if ($read_identity == 1){ | |
| 3267 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3268 } | |
| 3269 else{ | |
| 3270 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3271 } | |
| 3272 } | |
| 3273 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3274 $counting{total_meCpG_count}++; | |
| 3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3276 if ($read_identity == 1){ | |
| 3277 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3278 } | |
| 3279 else{ | |
| 3280 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3281 } | |
| 3282 } | |
| 3283 elsif ($methylation_calls[$index] eq 'z') { | |
| 3284 $counting{total_unmethylated_CpG_count}++; | |
| 3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3286 if ($read_identity == 1){ | |
| 3287 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3288 } | |
| 3289 else{ | |
| 3290 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3291 } | |
| 3292 } | |
| 3293 elsif ($methylation_calls[$index] eq 'H') { | |
| 3294 $counting{total_meCHH_count}++; | |
| 3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3296 if ($read_identity == 1){ | |
| 3297 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3298 } | |
| 3299 else{ | |
| 3300 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3301 } | |
| 3302 } | |
| 3303 elsif ($methylation_calls[$index] eq 'h') { | |
| 3304 $counting{total_unmethylated_CHH_count}++; | |
| 3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3306 if ($read_identity == 1){ | |
| 3307 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3308 } | |
| 3309 else{ | |
| 3310 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3311 } | |
| 3312 } | |
| 3313 elsif ($methylation_calls[$index] eq '.') {} | |
| 3314 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3315 else{ | |
| 3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3317 } | |
| 3318 } | |
| 3319 } elsif ($strand eq '-') { | |
| 3320 for my $index (0..$#methylation_calls) { | |
| 3321 | |
| 3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3325 $cigar_offset += $cigar_mod; | |
| 3326 $pos_offset += $pos_mod; | |
| 3327 } | |
| 3328 | |
| 3329 ### Returning as soon as the methylation calls start overlapping | |
| 3330 if ($start-$index+$pos_offset <= $end_read_1) { | |
| 3331 return; | |
| 3332 } | |
| 3333 | |
| 3334 if ($methylation_calls[$index] eq 'X') { | |
| 3335 $counting{total_meCHG_count}++; | |
| 3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3337 if ($read_identity == 1){ | |
| 3338 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3339 } | |
| 3340 else{ | |
| 3341 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3342 } | |
| 3343 } | |
| 3344 elsif ($methylation_calls[$index] eq 'x') { | |
| 3345 $counting{total_unmethylated_CHG_count}++; | |
| 3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3347 if ($read_identity == 1){ | |
| 3348 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3349 } | |
| 3350 else{ | |
| 3351 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3352 } | |
| 3353 } | |
| 3354 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3355 $counting{total_meCpG_count}++; | |
| 3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3357 if ($read_identity == 1){ | |
| 3358 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3359 } | |
| 3360 else{ | |
| 3361 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3362 } | |
| 3363 } | |
| 3364 elsif ($methylation_calls[$index] eq 'z') { | |
| 3365 $counting{total_unmethylated_CpG_count}++; | |
| 3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3367 if ($read_identity == 1){ | |
| 3368 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3369 } | |
| 3370 else{ | |
| 3371 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3372 } | |
| 3373 } | |
| 3374 elsif ($methylation_calls[$index] eq 'H') { | |
| 3375 $counting{total_meCHH_count}++; | |
| 3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3377 if ($read_identity == 1){ | |
| 3378 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3379 } | |
| 3380 else{ | |
| 3381 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3382 } | |
| 3383 } | |
| 3384 elsif ($methylation_calls[$index] eq 'h') { | |
| 3385 $counting{total_unmethylated_CHH_count}++; | |
| 3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3387 if ($read_identity == 1){ | |
| 3388 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3389 } | |
| 3390 else{ | |
| 3391 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3392 } | |
| 3393 } | |
| 3394 elsif ($methylation_calls[$index] eq '.') {} | |
| 3395 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3396 else{ | |
| 3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3398 } | |
| 3399 } | |
| 3400 } else { | |
| 3401 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3402 } | |
| 3403 } | |
| 3404 | |
| 3405 ### strand-specific methylation output | |
| 3406 else { | |
| 3407 if ($strand eq '+') { | |
| 3408 for my $index (0..$#methylation_calls) { | |
| 3409 | |
| 3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 3413 $cigar_offset += $cigar_mod; | |
| 3414 $pos_offset += $pos_mod; | |
| 3415 } | |
| 3416 | |
| 3417 ### Returning as soon as the methylation calls start overlapping | |
| 3418 if ($start+$index+$pos_offset >= $end_read_1) { | |
| 3419 return; | |
| 3420 } | |
| 3421 | |
| 3422 if ($methylation_calls[$index] eq 'X') { | |
| 3423 $counting{total_meCHG_count}++; | |
| 3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3425 if ($read_identity == 1){ | |
| 3426 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3427 } | |
| 3428 else{ | |
| 3429 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3430 } | |
| 3431 } | |
| 3432 elsif ($methylation_calls[$index] eq 'x') { | |
| 3433 $counting{total_unmethylated_CHG_count}++; | |
| 3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3435 if ($read_identity == 1){ | |
| 3436 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3437 } | |
| 3438 else{ | |
| 3439 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3440 } | |
| 3441 } | |
| 3442 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3443 $counting{total_meCpG_count}++; | |
| 3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3445 if ($read_identity == 1){ | |
| 3446 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3447 } | |
| 3448 else{ | |
| 3449 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3450 } | |
| 3451 } | |
| 3452 elsif ($methylation_calls[$index] eq 'z') { | |
| 3453 $counting{total_unmethylated_CpG_count}++; | |
| 3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3455 if ($read_identity == 1){ | |
| 3456 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3457 } | |
| 3458 else{ | |
| 3459 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3460 } | |
| 3461 } | |
| 3462 elsif ($methylation_calls[$index] eq 'H') { | |
| 3463 $counting{total_meCHH_count}++; | |
| 3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3465 if ($read_identity == 1){ | |
| 3466 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3467 } | |
| 3468 else{ | |
| 3469 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3470 } | |
| 3471 } | |
| 3472 elsif ($methylation_calls[$index] eq 'h') { | |
| 3473 $counting{total_unmethylated_CHH_count}++; | |
| 3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3475 if ($read_identity == 1){ | |
| 3476 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3477 } | |
| 3478 else{ | |
| 3479 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3480 } | |
| 3481 } | |
| 3482 elsif ($methylation_calls[$index] eq '.') {} | |
| 3483 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3484 else{ | |
| 3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3486 } | |
| 3487 } | |
| 3488 } elsif ($strand eq '-') { | |
| 3489 for my $index (0..$#methylation_calls) { | |
| 3490 | |
| 3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3494 $cigar_offset += $cigar_mod; | |
| 3495 $pos_offset += $pos_mod; | |
| 3496 } | |
| 3497 | |
| 3498 ### Returning as soon as the methylation calls start overlapping | |
| 3499 if ($start-$index+$pos_offset <= $end_read_1) { | |
| 3500 return; | |
| 3501 } | |
| 3502 | |
| 3503 if ($methylation_calls[$index] eq 'X') { | |
| 3504 $counting{total_meCHG_count}++; | |
| 3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3506 if ($read_identity == 1){ | |
| 3507 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3508 } | |
| 3509 else{ | |
| 3510 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3511 } | |
| 3512 } | |
| 3513 elsif ($methylation_calls[$index] eq 'x') { | |
| 3514 $counting{total_unmethylated_CHG_count}++; | |
| 3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3516 if ($read_identity == 1){ | |
| 3517 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3518 } | |
| 3519 else{ | |
| 3520 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3521 } | |
| 3522 } | |
| 3523 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3524 $counting{total_meCpG_count}++; | |
| 3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3526 if ($read_identity == 1){ | |
| 3527 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3528 } | |
| 3529 else{ | |
| 3530 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3531 } | |
| 3532 } | |
| 3533 elsif ($methylation_calls[$index] eq 'z') { | |
| 3534 $counting{total_unmethylated_CpG_count}++; | |
| 3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3536 if ($read_identity == 1){ | |
| 3537 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3538 } | |
| 3539 else{ | |
| 3540 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3541 } | |
| 3542 } | |
| 3543 elsif ($methylation_calls[$index] eq 'H') { | |
| 3544 $counting{total_meCHH_count}++; | |
| 3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3546 if ($read_identity == 1){ | |
| 3547 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3548 } | |
| 3549 else{ | |
| 3550 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3551 } | |
| 3552 } | |
| 3553 elsif ($methylation_calls[$index] eq 'h') { | |
| 3554 $counting{total_unmethylated_CHH_count}++; | |
| 3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3556 if ($read_identity == 1){ | |
| 3557 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3558 } | |
| 3559 else{ | |
| 3560 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3561 } | |
| 3562 } | |
| 3563 elsif ($methylation_calls[$index] eq '.') {} | |
| 3564 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3565 else{ | |
| 3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3567 } | |
| 3568 } | |
| 3569 } else { | |
| 3570 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3571 } | |
| 3572 } | |
| 3573 } | |
| 3574 | |
| 3575 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
| 3576 else { | |
| 3577 ### single-file CpG, CHG and CHH context output | |
| 3578 if ($full) { | |
| 3579 if ($strand eq '+') { | |
| 3580 for my $index (0..$#methylation_calls) { | |
| 3581 | |
| 3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 3585 $cigar_offset += $cigar_mod; | |
| 3586 $pos_offset += $pos_mod; | |
| 3587 } | |
| 3588 | |
| 3589 if ($methylation_calls[$index] eq 'X') { | |
| 3590 $counting{total_meCHG_count}++; | |
| 3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3592 if ($read_identity == 1){ | |
| 3593 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3594 } | |
| 3595 else{ | |
| 3596 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3597 } | |
| 3598 } | |
| 3599 elsif ($methylation_calls[$index] eq 'x') { | |
| 3600 $counting{total_unmethylated_CHG_count}++; | |
| 3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3602 if ($read_identity == 1){ | |
| 3603 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3604 } | |
| 3605 else{ | |
| 3606 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3607 } | |
| 3608 } | |
| 3609 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3610 $counting{total_meCpG_count}++; | |
| 3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3612 if ($read_identity == 1){ | |
| 3613 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3614 } | |
| 3615 else{ | |
| 3616 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3617 } | |
| 3618 } | |
| 3619 elsif ($methylation_calls[$index] eq 'z') { | |
| 3620 $counting{total_unmethylated_CpG_count}++; | |
| 3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3622 if ($read_identity == 1){ | |
| 3623 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3624 } | |
| 3625 else{ | |
| 3626 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3627 } | |
| 3628 } | |
| 3629 elsif ($methylation_calls[$index] eq 'H') { | |
| 3630 $counting{total_meCHH_count}++; | |
| 3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3632 if ($read_identity == 1){ | |
| 3633 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3634 } | |
| 3635 else{ | |
| 3636 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3637 } | |
| 3638 } | |
| 3639 elsif ($methylation_calls[$index] eq 'h') { | |
| 3640 $counting{total_unmethylated_CHH_count}++; | |
| 3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3642 if ($read_identity == 1){ | |
| 3643 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3644 } | |
| 3645 else{ | |
| 3646 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3647 } | |
| 3648 } | |
| 3649 elsif ($methylation_calls[$index] eq '.') {} | |
| 3650 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3651 else{ | |
| 3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3653 } | |
| 3654 } | |
| 3655 } elsif ($strand eq '-') { | |
| 3656 for my $index (0..$#methylation_calls) { | |
| 3657 | |
| 3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3661 $cigar_offset += $cigar_mod; | |
| 3662 $pos_offset += $pos_mod; | |
| 3663 } | |
| 3664 | |
| 3665 if ($methylation_calls[$index] eq 'X') { | |
| 3666 $counting{total_meCHG_count}++; | |
| 3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3668 if ($read_identity == 1){ | |
| 3669 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3670 } | |
| 3671 else{ | |
| 3672 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3673 } | |
| 3674 } | |
| 3675 elsif ($methylation_calls[$index] eq 'x') { | |
| 3676 $counting{total_unmethylated_CHG_count}++; | |
| 3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3678 if ($read_identity == 1){ | |
| 3679 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3680 } | |
| 3681 else{ | |
| 3682 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3683 } | |
| 3684 } | |
| 3685 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3686 $counting{total_meCpG_count}++; | |
| 3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3688 if ($read_identity == 1){ | |
| 3689 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3690 } | |
| 3691 else{ | |
| 3692 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3693 } | |
| 3694 } | |
| 3695 elsif ($methylation_calls[$index] eq 'z') { | |
| 3696 $counting{total_unmethylated_CpG_count}++; | |
| 3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3698 if ($read_identity == 1){ | |
| 3699 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3700 } | |
| 3701 else{ | |
| 3702 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3703 } | |
| 3704 } | |
| 3705 elsif ($methylation_calls[$index] eq 'H') { | |
| 3706 $counting{total_meCHH_count}++; | |
| 3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3708 if ($read_identity == 1){ | |
| 3709 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3710 } | |
| 3711 else{ | |
| 3712 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3713 } | |
| 3714 } | |
| 3715 elsif ($methylation_calls[$index] eq 'h') { | |
| 3716 $counting{total_unmethylated_CHH_count}++; | |
| 3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3718 if ($read_identity == 1){ | |
| 3719 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3720 } | |
| 3721 else{ | |
| 3722 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3723 } | |
| 3724 } | |
| 3725 elsif ($methylation_calls[$index] eq '.') {} | |
| 3726 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3727 else{ | |
| 3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3729 } | |
| 3730 } | |
| 3731 } else { | |
| 3732 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3733 } | |
| 3734 } | |
| 3735 | |
| 3736 ### strand-specific methylation output | |
| 3737 else { | |
| 3738 if ($strand eq '+') { | |
| 3739 for my $index (0..$#methylation_calls) { | |
| 3740 | |
| 3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 3744 $cigar_offset += $cigar_mod; | |
| 3745 $pos_offset += $pos_mod; | |
| 3746 } | |
| 3747 | |
| 3748 if ($methylation_calls[$index] eq 'X') { | |
| 3749 $counting{total_meCHG_count}++; | |
| 3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3751 if ($read_identity == 1){ | |
| 3752 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3753 } | |
| 3754 else{ | |
| 3755 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3756 } | |
| 3757 } | |
| 3758 elsif ($methylation_calls[$index] eq 'x') { | |
| 3759 $counting{total_unmethylated_CHG_count}++; | |
| 3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3761 if ($read_identity == 1){ | |
| 3762 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3763 } | |
| 3764 else{ | |
| 3765 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3766 } | |
| 3767 } | |
| 3768 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3769 $counting{total_meCpG_count}++; | |
| 3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3771 if ($read_identity == 1){ | |
| 3772 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3773 } | |
| 3774 else{ | |
| 3775 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3776 } | |
| 3777 } | |
| 3778 elsif ($methylation_calls[$index] eq 'z') { | |
| 3779 $counting{total_unmethylated_CpG_count}++; | |
| 3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3781 if ($read_identity == 1){ | |
| 3782 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3783 } | |
| 3784 else{ | |
| 3785 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3786 } | |
| 3787 } | |
| 3788 elsif ($methylation_calls[$index] eq 'H') { | |
| 3789 $counting{total_meCHH_count}++; | |
| 3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3791 if ($read_identity == 1){ | |
| 3792 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3793 } | |
| 3794 else{ | |
| 3795 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3796 } | |
| 3797 } | |
| 3798 elsif ($methylation_calls[$index] eq 'h') { | |
| 3799 $counting{total_unmethylated_CHH_count}++; | |
| 3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3801 if ($read_identity == 1){ | |
| 3802 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3803 } | |
| 3804 else{ | |
| 3805 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3806 } | |
| 3807 } | |
| 3808 elsif ($methylation_calls[$index] eq '.') {} | |
| 3809 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3810 else{ | |
| 3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3812 } | |
| 3813 } | |
| 3814 } elsif ($strand eq '-') { | |
| 3815 for my $index (0..$#methylation_calls) { | |
| 3816 | |
| 3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
| 3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 3820 $cigar_offset += $cigar_mod; | |
| 3821 $pos_offset += $pos_mod; | |
| 3822 } | |
| 3823 | |
| 3824 if ($methylation_calls[$index] eq 'X') { | |
| 3825 $counting{total_meCHG_count}++; | |
| 3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3827 if ($read_identity == 1){ | |
| 3828 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 3829 } | |
| 3830 else{ | |
| 3831 $mbias_2{CHG}->{$index+1}->{meth}++; | |
| 3832 } | |
| 3833 } | |
| 3834 elsif ($methylation_calls[$index] eq 'x') { | |
| 3835 $counting{total_unmethylated_CHG_count}++; | |
| 3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3837 if ($read_identity == 1){ | |
| 3838 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 3839 } | |
| 3840 else{ | |
| 3841 $mbias_2{CHG}->{$index+1}->{un}++; | |
| 3842 } | |
| 3843 } | |
| 3844 elsif ($methylation_calls[$index] eq 'Z') { | |
| 3845 $counting{total_meCpG_count}++; | |
| 3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3847 if ($read_identity == 1){ | |
| 3848 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 3849 } | |
| 3850 else{ | |
| 3851 $mbias_2{CpG}->{$index+1}->{meth}++; | |
| 3852 } | |
| 3853 } | |
| 3854 elsif ($methylation_calls[$index] eq 'z') { | |
| 3855 $counting{total_unmethylated_CpG_count}++; | |
| 3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3857 if ($read_identity == 1){ | |
| 3858 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 3859 } | |
| 3860 else{ | |
| 3861 $mbias_2{CpG}->{$index+1}->{un}++; | |
| 3862 } | |
| 3863 } | |
| 3864 elsif ($methylation_calls[$index] eq 'H') { | |
| 3865 $counting{total_meCHH_count}++; | |
| 3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3867 if ($read_identity == 1){ | |
| 3868 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 3869 } | |
| 3870 else{ | |
| 3871 $mbias_2{CHH}->{$index+1}->{meth}++; | |
| 3872 } | |
| 3873 } | |
| 3874 elsif ($methylation_calls[$index] eq 'h') { | |
| 3875 $counting{total_unmethylated_CHH_count}++; | |
| 3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 3877 if ($read_identity == 1){ | |
| 3878 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 3879 } | |
| 3880 else{ | |
| 3881 $mbias_2{CHH}->{$index+1}->{un}++; | |
| 3882 } | |
| 3883 } | |
| 3884 elsif ($methylation_calls[$index] eq '.') {} | |
| 3885 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 3886 else{ | |
| 3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 3888 } | |
| 3889 } | |
| 3890 } else { | |
| 3891 die "The strand orientation as neither + nor -: '$strand'\n"; | |
| 3892 } | |
| 3893 } | |
| 3894 } | |
| 3895 } | |
| 3896 | |
| 3897 sub check_cigar_string { | |
| 3898 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; | |
| 3899 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; | |
| 3900 my ($new_cigar_offset,$new_pos_offset) = (0,0); | |
| 3901 | |
| 3902 if ($strand eq '+') { | |
| 3903 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
| 3904 | |
| 3905 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
| 3906 # warn "position needs no adjustment\n"; | |
| 3907 } | |
| 3908 | |
| 3909 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
| 3910 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
| 3911 # warn "adjusted genomic position by -1 bp (insertion)\n"; | |
| 3912 } | |
| 3913 | |
| 3914 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
| 3915 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
| 3916 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
| 3917 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
| 3918 | |
| 3919 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
| 3920 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
| 3921 # warn "position needs no adjustment\n"; | |
| 3922 last; | |
| 3923 } | |
| 3924 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
| 3925 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
| 3926 # warn "adjusted genomic position by another -1 bp (insertion)\n"; | |
| 3927 last; | |
| 3928 } | |
| 3929 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
| 3930 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
| 3931 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
| 3932 # warn "adjusted genomic position by another +1 bp (deletion)\n"; | |
| 3933 } | |
| 3934 else{ | |
| 3935 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
| 3936 } | |
| 3937 } | |
| 3938 } | |
| 3939 else{ | |
| 3940 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
| 3941 } | |
| 3942 } | |
| 3943 | |
| 3944 elsif ($strand eq '-') { | |
| 3945 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
| 3946 | |
| 3947 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
| 3948 # warn "position needs no adjustment\n"; | |
| 3949 } | |
| 3950 | |
| 3951 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
| 3952 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position | |
| 3953 # warn "adjusted genomic position by +1 bp (insertion)\n"; | |
| 3954 } | |
| 3955 | |
| 3956 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
| 3957 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
| 3958 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
| 3959 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
| 3960 | |
| 3961 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
| 3962 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
| 3963 # warn "Found new 'M' operation; position needs no adjustment\n"; | |
| 3964 last; | |
| 3965 } | |
| 3966 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
| 3967 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position | |
| 3968 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; | |
| 3969 last; | |
| 3970 } | |
| 3971 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
| 3972 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
| 3973 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
| 3974 # warn "adjusted genomic position by another -1 bp (deletion)\n"; | |
| 3975 } | |
| 3976 else{ | |
| 3977 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
| 3978 } | |
| 3979 } | |
| 3980 } | |
| 3981 else{ | |
| 3982 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
| 3983 } | |
| 3984 } | |
| 3985 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; | |
| 3986 return ($new_cigar_offset,$new_pos_offset); | |
| 3987 } | |
| 3988 | |
| 3989 sub print_individual_C_methylation_states_single_end{ | |
| 3990 | |
| 3991 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
| 3992 my @methylation_calls = split(//,$meth_call); | |
| 3993 | |
| 3994 ################################################################# | |
| 3995 ### . for bases not involving cytosines ### | |
| 3996 ### X for methylated C in CHG context (was protected) ### | |
| 3997 ### x for not methylated C in CHG context (was converted) ### | |
| 3998 ### H for methylated C in CHH context (was protected) ### | |
| 3999 ### h for not methylated C in CHH context (was converted) ### | |
| 4000 ### Z for methylated C in CpG context (was protected) ### | |
| 4001 ### z for not methylated C in CpG context (was converted) ### | |
| 4002 ################################################################# | |
| 4003 | |
| 4004 my $methyl_CHG_count = 0; | |
| 4005 my $methyl_CHH_count = 0; | |
| 4006 my $methyl_CpG_count = 0; | |
| 4007 my $unmethylated_CHG_count = 0; | |
| 4008 my $unmethylated_CHH_count = 0; | |
| 4009 my $unmethylated_CpG_count = 0; | |
| 4010 | |
| 4011 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
| 4012 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
| 4013 | |
| 4014 my @comp_cigar; | |
| 4015 | |
| 4016 if ($cigar){ # parsing CIGAR string | |
| 4017 | |
| 4018 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
| 4019 if ($cigar =~ /^\d+M$/){ | |
| 4020 # warn "See!? I told you so! $cigar\n"; | |
| 4021 # sleep(1); | |
| 4022 } | |
| 4023 else{ | |
| 4024 | |
| 4025 my @len; | |
| 4026 my @ops; | |
| 4027 | |
| 4028 @len = split (/\D+/,$cigar); # storing the length per operation | |
| 4029 @ops = split (/\d+/,$cigar); # storing the operation | |
| 4030 shift @ops; # remove the empty first element | |
| 4031 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); | |
| 4032 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
| 4033 | |
| 4034 foreach my $index (0..$#len){ | |
| 4035 foreach (1..$len[$index]){ | |
| 4036 # print "$ops[$index]"; | |
| 4037 push @comp_cigar, $ops[$index]; | |
| 4038 } | |
| 4039 } | |
| 4040 } | |
| 4041 # warn "\nDetected CIGAR string: $cigar\n"; | |
| 4042 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
| 4043 # warn "number of operations: ",scalar @ops,"\n"; | |
| 4044 # warn "number of length digits: ",scalar @len,"\n\n"; | |
| 4045 # print @comp_cigar,"\n"; | |
| 4046 # print "$meth_call\n\n"; | |
| 4047 # sleep (1); | |
| 4048 } | |
| 4049 | |
| 4050 ### adjusting the start position for all reads mapping to the reverse strand | |
| 4051 if ($strand eq '-') { | |
| 4052 | |
| 4053 if (@comp_cigar){ # only needed for SAM reads with InDels | |
| 4054 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
| 4055 # print @comp_cigar,"\n"; | |
| 4056 } | |
| 4057 | |
| 4058 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
| 4059 | |
| 4060 if ($cigar){ ### SAM format | |
| 4061 if ($cigar =~ /^(\d+)M$/){ # linear match | |
| 4062 $start += $1 - 1; | |
| 4063 } | |
| 4064 else{ # InDel read | |
| 4065 my $MD_count = 0; | |
| 4066 foreach (@comp_cigar){ | |
| 4067 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
| 4068 } | |
| 4069 $start += $MD_count - 1; | |
| 4070 } | |
| 4071 } | |
| 4072 else{ ### vanilla format | |
| 4073 $start += length($meth_call)-1; | |
| 4074 } | |
| 4075 } | |
| 4076 } | |
| 4077 | |
| 4078 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) | |
| 4079 | |
| 4080 ### single-file CpG and other-context output | |
| 4081 if ($full and $merge_non_CpG) { | |
| 4082 if ($strand eq '+') { | |
| 4083 for my $index (0..$#methylation_calls) { | |
| 4084 | |
| 4085 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels | |
| 4086 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4087 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
| 4088 $cigar_offset += $cigar_mod; | |
| 4089 $pos_offset += $pos_mod; | |
| 4090 } | |
| 4091 | |
| 4092 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4093 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4094 if ($methylation_calls[$index] eq 'X') { | |
| 4095 $counting{total_meCHG_count}++; | |
| 4096 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4097 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4098 } | |
| 4099 elsif ($methylation_calls[$index] eq 'x') { | |
| 4100 $counting{total_unmethylated_CHG_count}++; | |
| 4101 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4102 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4103 } | |
| 4104 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4105 $counting{total_meCpG_count}++; | |
| 4106 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4107 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4108 } | |
| 4109 elsif ($methylation_calls[$index] eq 'z') { | |
| 4110 $counting{total_unmethylated_CpG_count}++; | |
| 4111 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4112 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4113 } | |
| 4114 elsif ($methylation_calls[$index] eq 'H') { | |
| 4115 $counting{total_meCHH_count}++; | |
| 4116 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4117 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4118 } | |
| 4119 elsif ($methylation_calls[$index] eq 'h') { | |
| 4120 $counting{total_unmethylated_CHH_count}++; | |
| 4121 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4122 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4123 } | |
| 4124 elsif ($methylation_calls[$index] eq '.') {} | |
| 4125 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4126 else{ | |
| 4127 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4128 } | |
| 4129 } | |
| 4130 } | |
| 4131 elsif ($strand eq '-') { | |
| 4132 | |
| 4133 for my $index (0..$#methylation_calls) { | |
| 4134 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4135 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4136 | |
| 4137 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels | |
| 4138 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
| 4139 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4140 $cigar_offset += $cigar_mod; | |
| 4141 $pos_offset += $pos_mod; | |
| 4142 } | |
| 4143 | |
| 4144 if ($methylation_calls[$index] eq 'X') { | |
| 4145 $counting{total_meCHG_count}++; | |
| 4146 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4147 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4148 } | |
| 4149 elsif ($methylation_calls[$index] eq 'x') { | |
| 4150 $counting{total_unmethylated_CHG_count}++; | |
| 4151 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4152 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4153 } | |
| 4154 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4155 $counting{total_meCpG_count}++; | |
| 4156 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4157 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4158 } | |
| 4159 elsif ($methylation_calls[$index] eq 'z') { | |
| 4160 $counting{total_unmethylated_CpG_count}++; | |
| 4161 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4162 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4163 } | |
| 4164 elsif ($methylation_calls[$index] eq 'H') { | |
| 4165 $counting{total_meCHH_count}++; | |
| 4166 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4167 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4168 } | |
| 4169 elsif ($methylation_calls[$index] eq 'h') { | |
| 4170 $counting{total_unmethylated_CHH_count}++; | |
| 4171 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4172 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4173 } | |
| 4174 elsif ($methylation_calls[$index] eq '.'){} | |
| 4175 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4176 else{ | |
| 4177 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4178 } | |
| 4179 } | |
| 4180 } | |
| 4181 else { | |
| 4182 die "The strand information was neither + nor -: $strand\n"; | |
| 4183 } | |
| 4184 } | |
| 4185 | |
| 4186 ### strand-specific methylation output | |
| 4187 elsif ($merge_non_CpG) { | |
| 4188 if ($strand eq '+') { | |
| 4189 for my $index (0..$#methylation_calls) { | |
| 4190 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4191 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4192 | |
| 4193 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4194 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4195 $cigar_offset += $cigar_mod; | |
| 4196 $pos_offset += $pos_mod; | |
| 4197 } | |
| 4198 | |
| 4199 if ($methylation_calls[$index] eq 'X') { | |
| 4200 $counting{total_meCHG_count}++; | |
| 4201 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4202 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4203 } | |
| 4204 elsif ($methylation_calls[$index] eq 'x') { | |
| 4205 $counting{total_unmethylated_CHG_count}++; | |
| 4206 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4207 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4208 } | |
| 4209 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4210 $counting{total_meCpG_count}++; | |
| 4211 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4212 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4213 } | |
| 4214 elsif ($methylation_calls[$index] eq 'z') { | |
| 4215 $counting{total_unmethylated_CpG_count}++; | |
| 4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4217 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4218 } | |
| 4219 elsif ($methylation_calls[$index] eq 'H') { | |
| 4220 $counting{total_meCHH_count}++; | |
| 4221 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4222 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4223 } | |
| 4224 elsif ($methylation_calls[$index] eq 'h') { | |
| 4225 $counting{total_unmethylated_CHH_count}++; | |
| 4226 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4227 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4228 } | |
| 4229 elsif ($methylation_calls[$index] eq '.') {} | |
| 4230 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4231 else{ | |
| 4232 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4233 } | |
| 4234 } | |
| 4235 } | |
| 4236 elsif ($strand eq '-') { | |
| 4237 | |
| 4238 for my $index (0..$#methylation_calls) { | |
| 4239 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4240 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4241 | |
| 4242 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4243 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4244 $cigar_offset += $cigar_mod; | |
| 4245 $pos_offset += $pos_mod; | |
| 4246 } | |
| 4247 | |
| 4248 if ($methylation_calls[$index] eq 'X') { | |
| 4249 $counting{total_meCHG_count}++; | |
| 4250 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4251 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4252 } | |
| 4253 elsif ($methylation_calls[$index] eq 'x') { | |
| 4254 $counting{total_unmethylated_CHG_count}++; | |
| 4255 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4256 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4257 } | |
| 4258 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4259 $counting{total_meCpG_count}++; | |
| 4260 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4261 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4262 } | |
| 4263 elsif ($methylation_calls[$index] eq 'z') { | |
| 4264 $counting{total_unmethylated_CpG_count}++; | |
| 4265 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4266 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4267 } | |
| 4268 elsif ($methylation_calls[$index] eq 'H') { | |
| 4269 $counting{total_meCHH_count}++; | |
| 4270 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4271 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4272 } | |
| 4273 elsif ($methylation_calls[$index] eq 'h') { | |
| 4274 $counting{total_unmethylated_CHH_count}++; | |
| 4275 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4276 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4277 } | |
| 4278 elsif ($methylation_calls[$index] eq '.') {} | |
| 4279 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4280 else{ | |
| 4281 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4282 } | |
| 4283 } | |
| 4284 } | |
| 4285 else { | |
| 4286 die "The strand information was neither + nor -: $strand\n"; | |
| 4287 } | |
| 4288 } | |
| 4289 | |
| 4290 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION | |
| 4291 | |
| 4292 elsif ($full) { | |
| 4293 if ($strand eq '+') { | |
| 4294 for my $index (0..$#methylation_calls) { | |
| 4295 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4296 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4297 | |
| 4298 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4299 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4300 $cigar_offset += $cigar_mod; | |
| 4301 $pos_offset += $pos_mod; | |
| 4302 } | |
| 4303 | |
| 4304 if ($methylation_calls[$index] eq 'X') { | |
| 4305 $counting{total_meCHG_count}++; | |
| 4306 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4307 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4308 } | |
| 4309 elsif ($methylation_calls[$index] eq 'x') { | |
| 4310 $counting{total_unmethylated_CHG_count}++; | |
| 4311 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4312 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4313 } | |
| 4314 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4315 $counting{total_meCpG_count}++; | |
| 4316 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4317 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4318 } | |
| 4319 elsif ($methylation_calls[$index] eq 'z') { | |
| 4320 $counting{total_unmethylated_CpG_count}++; | |
| 4321 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4322 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4323 } | |
| 4324 elsif ($methylation_calls[$index] eq 'H') { | |
| 4325 $counting{total_meCHH_count}++; | |
| 4326 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4327 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4328 } | |
| 4329 elsif ($methylation_calls[$index] eq 'h') { | |
| 4330 $counting{total_unmethylated_CHH_count}++; | |
| 4331 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4332 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4333 } | |
| 4334 elsif ($methylation_calls[$index] eq '.') {} | |
| 4335 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4336 else{ | |
| 4337 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
| 4338 } | |
| 4339 } | |
| 4340 } | |
| 4341 elsif ($strand eq '-') { | |
| 4342 | |
| 4343 for my $index (0..$#methylation_calls) { | |
| 4344 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4345 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4346 | |
| 4347 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4348 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4349 $cigar_offset += $cigar_mod; | |
| 4350 $pos_offset += $pos_mod; | |
| 4351 } | |
| 4352 | |
| 4353 if ($methylation_calls[$index] eq 'X') { | |
| 4354 $counting{total_meCHG_count}++; | |
| 4355 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4356 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4357 } | |
| 4358 elsif ($methylation_calls[$index] eq 'x') { | |
| 4359 $counting{total_unmethylated_CHG_count}++; | |
| 4360 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4361 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4362 } | |
| 4363 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4364 $counting{total_meCpG_count}++; | |
| 4365 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4366 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4367 } | |
| 4368 elsif ($methylation_calls[$index] eq 'z') { | |
| 4369 $counting{total_unmethylated_CpG_count}++; | |
| 4370 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4371 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4372 } | |
| 4373 elsif ($methylation_calls[$index] eq 'H') { | |
| 4374 $counting{total_meCHH_count}++; | |
| 4375 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4376 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4377 } | |
| 4378 elsif ($methylation_calls[$index] eq 'h') { | |
| 4379 $counting{total_unmethylated_CHH_count}++; | |
| 4380 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4381 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4382 } | |
| 4383 elsif ($methylation_calls[$index] eq '.') {} | |
| 4384 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4385 else{ | |
| 4386 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4387 } | |
| 4388 } | |
| 4389 } | |
| 4390 else { | |
| 4391 die "The read had a strand orientation which was neither + nor -: $strand\n"; | |
| 4392 } | |
| 4393 } | |
| 4394 | |
| 4395 ### strand-specific methylation output | |
| 4396 else { | |
| 4397 if ($strand eq '+') { | |
| 4398 for my $index (0..$#methylation_calls) { | |
| 4399 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4400 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4401 | |
| 4402 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4403 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4404 $cigar_offset += $cigar_mod; | |
| 4405 $pos_offset += $pos_mod; | |
| 4406 } | |
| 4407 | |
| 4408 if ($methylation_calls[$index] eq 'X') { | |
| 4409 $counting{total_meCHG_count}++; | |
| 4410 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4411 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4412 } | |
| 4413 elsif ($methylation_calls[$index] eq 'x') { | |
| 4414 $counting{total_unmethylated_CHG_count}++; | |
| 4415 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4416 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4417 } | |
| 4418 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4419 $counting{total_meCpG_count}++; | |
| 4420 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4421 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4422 } | |
| 4423 elsif ($methylation_calls[$index] eq 'z') { | |
| 4424 $counting{total_unmethylated_CpG_count}++; | |
| 4425 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4426 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4427 } | |
| 4428 elsif ($methylation_calls[$index] eq 'H') { | |
| 4429 $counting{total_meCHH_count}++; | |
| 4430 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4431 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4432 } | |
| 4433 elsif ($methylation_calls[$index] eq 'h') { | |
| 4434 $counting{total_unmethylated_CHH_count}++; | |
| 4435 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4436 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4437 } | |
| 4438 elsif ($methylation_calls[$index] eq '.') {} | |
| 4439 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4440 else{ | |
| 4441 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4442 } | |
| 4443 } | |
| 4444 } | |
| 4445 elsif ($strand eq '-') { | |
| 4446 | |
| 4447 for my $index (0..$#methylation_calls) { | |
| 4448 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 4449 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 4450 | |
| 4451 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
| 4452 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
| 4453 $cigar_offset += $cigar_mod; | |
| 4454 $pos_offset += $pos_mod; | |
| 4455 } | |
| 4456 | |
| 4457 if ($methylation_calls[$index] eq 'X') { | |
| 4458 $counting{total_meCHG_count}++; | |
| 4459 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4460 $mbias_1{CHG}->{$index+1}->{meth}++; | |
| 4461 } | |
| 4462 elsif ($methylation_calls[$index] eq 'x') { | |
| 4463 $counting{total_unmethylated_CHG_count}++; | |
| 4464 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4465 $mbias_1{CHG}->{$index+1}->{un}++; | |
| 4466 } | |
| 4467 elsif ($methylation_calls[$index] eq 'Z') { | |
| 4468 $counting{total_meCpG_count}++; | |
| 4469 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4470 $mbias_1{CpG}->{$index+1}->{meth}++; | |
| 4471 } | |
| 4472 elsif ($methylation_calls[$index] eq 'z') { | |
| 4473 $counting{total_unmethylated_CpG_count}++; | |
| 4474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4475 $mbias_1{CpG}->{$index+1}->{un}++; | |
| 4476 } | |
| 4477 elsif ($methylation_calls[$index] eq 'H') { | |
| 4478 $counting{total_meCHH_count}++; | |
| 4479 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4480 $mbias_1{CHH}->{$index+1}->{meth}++; | |
| 4481 } | |
| 4482 elsif ($methylation_calls[$index] eq 'h') { | |
| 4483 $counting{total_unmethylated_CHH_count}++; | |
| 4484 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); | |
| 4485 $mbias_1{CHH}->{$index+1}->{un}++; | |
| 4486 } | |
| 4487 elsif ($methylation_calls[$index] eq '.') {} | |
| 4488 elsif (lc$methylation_calls[$index] eq 'u'){} | |
| 4489 else{ | |
| 4490 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 4491 } | |
| 4492 } | |
| 4493 } | |
| 4494 else { | |
| 4495 die "The strand information was neither + nor -: $strand\n"; | |
| 4496 } | |
| 4497 } | |
| 4498 } | |
| 4499 | |
| 4500 | |
| 4501 | |
| 4502 sub print_helpfile{ | |
| 4503 | |
| 4504 print << 'HOW_TO'; | |
| 4505 | |
| 4506 | |
| 4507 DESCRIPTION | |
| 4508 | |
| 4509 The following is a brief description of all options to control the Bismark | |
| 4510 methylation extractor. The script reads in a bisulfite read alignment results file | |
| 4511 produced by the Bismark bisulfite mapper and extracts the methylation information | |
| 4512 for individual cytosines. This information is found in the methylation call field | |
| 4513 which can contain the following characters: | |
| 4514 | |
| 4515 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| 4516 ~~~ X for methylated C in CHG context ~~~ | |
| 4517 ~~~ x for not methylated C CHG ~~~ | |
| 4518 ~~~ H for methylated C in CHH context ~~~ | |
| 4519 ~~~ h for not methylated C in CHH context ~~~ | |
| 4520 ~~~ Z for methylated C in CpG context ~~~ | |
| 4521 ~~~ z for not methylated C in CpG context ~~~ | |
| 4522 ~~~ U for methylated C in Unknown context (CN or CHN ~~~ | |
| 4523 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ | |
| 4524 ~~~ . for any bases not involving cytosines ~~~ | |
| 4525 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| 4526 | |
| 4527 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | |
| 4528 context (this distinction is actually already made in Bismark itself). As the methylation | |
| 4529 information for every C analysed can produce files which easily have tens or even hundreds of | |
| 4530 millions of lines, file sizes can become very large and more difficult to handle. The C | |
| 4531 methylation info additionally splits cytosine methylation calls up into one of the four possible | |
| 4532 strands a given bisulfite read aligned against: | |
| 4533 | |
| 4534 OT original top strand | |
| 4535 CTOT complementary to original top strand | |
| 4536 | |
| 4537 OB original bottom strand | |
| 4538 CTOB complementary to original bottom strand | |
| 4539 | |
| 4540 Thus, by default twelve individual output files are being generated per input file (unless | |
| 4541 --comprehensive is specified, see below). The output files can be imported into a genome | |
| 4542 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact | |
| 4543 unless the bisulfite reads were generated preserving directionality it doesn't make any | |
| 4544 sense to look at the data in a strand-specific manner). Strand-specific output files can | |
| 4545 optionally be skipped, in which case only three output files for CpG, CHG or CHH context | |
| 4546 will be generated. For both the strand-specific and comprehensive outputs there is also | |
| 4547 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. | |
| 4548 | |
| 4549 | |
| 4550 The output files are in the following format (tab delimited): | |
| 4551 | |
| 4552 <sequence_id> <strand> <chromosome> <position> <methylation call> | |
| 4553 | |
| 4554 | |
| 4555 USAGE: methylation_extractor [options] <filenames> | |
| 4556 | |
| 4557 | |
| 4558 ARGUMENTS: | |
| 4559 ========== | |
| 4560 | |
| 4561 <filenames> A space-separated list of Bismark result files in SAM format from | |
| 4562 which methylation information is extracted for every cytosine in | |
| 4563 the reads. For alignment files in the older custom Bismark output | |
| 4564 see option '--vanilla'. | |
| 4565 | |
| 4566 OPTIONS: | |
| 4567 | |
| 4568 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end | |
| 4569 read data. Specifying either --single-end or --paired-end is | |
| 4570 mandatory. | |
| 4571 | |
| 4572 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end | |
| 4573 read data. Specifying either --paired-end or --single-end is | |
| 4574 mandatory. | |
| 4575 | |
| 4576 --vanilla The Bismark result input file(s) are in the old custom Bismark format | |
| 4577 (up to version 0.5.x) and not in SAM format which is the default as | |
| 4578 of Bismark version 0.6.x or higher. Default: OFF. | |
| 4579 | |
| 4580 --no_overlap For paired-end reads it is theoretically possible that read_1 and | |
| 4581 read_2 overlap. This option avoids scoring overlapping methylation | |
| 4582 calls twice (only methylation calls of read 1 are used for in the process | |
| 4583 since read 1 has historically higher quality basecalls than read 2). | |
| 4584 Whilst this option removes a bias towards more methylation calls | |
| 4585 in the center of sequenced fragments it may de facto remove a sizable | |
| 4586 proportion of the data. This option is highly recommended for paired-end | |
| 4587 data. | |
| 4588 | |
| 4589 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 when processing the | |
| 4590 methylation call string. This can remove e.g. a restriction enzyme site | |
| 4591 at the start of each read or any other source of bias (e.g. PBAT-Seq data). | |
| 4592 | |
| 4593 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing | |
| 4594 results only. Since the first couple of bases in Read 2 of BS-Seq experiments | |
| 4595 show a severe bias towards non-methylation as a result of end-repairing | |
| 4596 sonicated fragments with unmethylated cytosines (see M-bias plot), it is | |
| 4597 recommended that the first couple of bp of Read 2 are removed before | |
| 4598 starting downstream analysis. Please see the section on M-bias plots in the | |
| 4599 Bismark User Guide for more details. | |
| 4600 | |
| 4601 --comprehensive Specifying this option will merge all four possible strand-specific | |
| 4602 methylation info into context-dependent output files. The default | |
| 4603 | |
| 4604 contexts are: | |
| 4605 - CpG context | |
| 4606 - CHG context | |
| 4607 - CHH context | |
| 4608 | |
| 4609 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight | |
| 4610 strand-specific output files (default) for Cs in | |
| 4611 - CpG context | |
| 4612 - non-CpG context | |
| 4613 | |
| 4614 --report Prints out a short methylation summary as well as the paramaters used to run | |
| 4615 this script. | |
| 4616 | |
| 4617 --no_header Suppresses the Bismark version header line in all output files for more convenient | |
| 4618 batch processing. | |
| 4619 | |
| 4620 -o/--output DIR Allows specification of a different output directory (absolute or relative | |
| 4621 path). If not specified explicitely, the output will be written to the current directory. | |
| 4622 | |
| 4623 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified | |
| 4624 explicitly if Samtools is in the PATH already. | |
| 4625 | |
| 4626 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in | |
| 4627 a GZIP compressed form to save disk space. This option does not work on bedGraph and | |
| 4628 genome-wide cytosine reports as they are 'tiny' anyway. | |
| 4629 | |
| 4630 --version Displays version information. | |
| 4631 | |
| 4632 -h/--help Displays this help file and exits. | |
| 4633 | |
| 4634 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as | |
| 4635 well as a report (optional) and then quit. Default: OFF. | |
| 4636 | |
| 4637 | |
| 4638 | |
| 4639 bedGraph specific options: | |
| 4640 ========================== | |
| 4641 | |
| 4642 --bedGraph After finishing the methylation extraction, the methylation output is written into a | |
| 4643 sorted bedGraph file that reports the position of a given cytosine and its methylation | |
| 4644 state (in %, see details below). The methylation extractor output is temporarily split up into | |
| 4645 temporary files, one per chromosome (written into the current directory or folder | |
| 4646 specified with -o/--output); these temp files are then used for sorting and deleted | |
| 4647 afterwards. By default, only cytosines in CpG context will be sorted. The option | |
| 4648 '--CX_context' may be used to report all cytosines irrespective of sequence context | |
| 4649 (this will take MUCH longer!). The default folder for temporary files during the sorting | |
| 4650 process is the output directory. The bedGraph conversion step is performed by the external | |
| 4651 module 'bismark2bedGraph'; this script needs to reside in the same folder as the | |
| 4652 bismark_methylation_extractor itself. | |
| 4653 | |
| 4654 | |
| 4655 --cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide | |
| 4656 before its methylation percentage is reported. Default: 1. | |
| 4657 | |
| 4658 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | |
| 4659 | |
| 4660 | |
| 4661 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | |
| 4662 in the experiment irrespective of its sequence context. This applies to both forward and | |
| 4663 reverse strands. Please be aware that this option may generate large temporary and output files | |
| 4664 and may take a long time to sort (up to many hours). Default: OFF. | |
| 4665 (i.e. Default = CpG context only). | |
| 4666 | |
| 4667 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. | |
| 4668 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or | |
| 4669 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. | |
| 4670 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. | |
| 4671 Defaults to 2G. | |
| 4672 | |
| 4673 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of | |
| 4674 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to | |
| 4675 individual chromosome files. These errors were caused by the operating system's limit | |
| 4676 of the number of filehandle that can be written to at any one time (typically 1024; to | |
| 4677 find out this limit on Linux, type: ulimit -a). | |
| 4678 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort | |
| 4679 methylation calls into individual chromosome files. Instead, all input files are | |
| 4680 temporarily merged into a single file (unless there is only a single file), and this | |
| 4681 file will then be sorted by both chromosome AND position using the Unix sort command. | |
| 4682 Please be aware that this option might take a looooong time to complete, depending on | |
| 4683 the size of the input files, and the memory you allocate to this process (see --buffer_size). | |
| 4684 Nevertheless, it seems to be working. | |
| 4685 | |
| 4686 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will | |
| 4687 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster | |
| 4688 sorting process of very large files, but this comes at the cost of a larger memory footprint | |
| 4689 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB | |
| 4690 of RAM). Due to overheads in creating and looping through these arrays it seems that it will | |
| 4691 actually be *slower* for small files (few million alignments), and we are currently testing at | |
| 4692 which point it is advisable to use this option. Note that --ample_memory is not compatible | |
| 4693 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). | |
| 4694 | |
| 4695 | |
| 4696 | |
| 4697 Genome-wide cytosine methylation report specific options: | |
| 4698 ========================================================= | |
| 4699 | |
| 4700 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a | |
| 4701 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based | |
| 4702 chromosome coordinates (zero-based cords are optional) and reports CpG context only (all | |
| 4703 cytosine context is optional). The output considers all Cs on both forward and reverse strands and | |
| 4704 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not | |
| 4705 covered). The cytsoine report conversion step is performed by the external module | |
| 4706 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor | |
| 4707 itself. | |
| 4708 | |
| 4709 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of | |
| 4710 its context. This applies to both forward and reverse strands. Please be aware that this will | |
| 4711 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. | |
| 4712 Default: OFF (i.e. Default = CpG context only). | |
| 4713 | |
| 4714 --zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. | |
| 4715 | |
| 4716 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted | |
| 4717 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. | |
| 4718 | |
| 4719 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files | |
| 4720 will be named to include the input filename and the chromosome number. | |
| 4721 | |
| 4722 | |
| 4723 | |
| 4724 OUTPUT: | |
| 4725 | |
| 4726 The bismark_methylation_extractor output is in the form: | |
| 4727 ======================================================== | |
| 4728 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> | |
| 4729 | |
| 4730 * Methylated cytosines receive a '+' orientation, | |
| 4731 * Unmethylated cytosines receive a '-' orientation. | |
| 4732 | |
| 4733 | |
| 4734 | |
| 4735 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): | |
| 4736 ========================================================================================================= | |
| 4737 | |
| 4738 track type=bedGraph (header line) | |
| 4739 | |
| 4740 <chromosome> <start position> <end position> <methylation percentage> | |
| 4741 | |
| 4742 | |
| 4743 | |
| 4744 The coverage output looks like this (tab-delimited, 1-based genomic coords): | |
| 4745 ============================================================================ | |
| 4746 | |
| 4747 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> | |
| 4748 | |
| 4749 | |
| 4750 | |
| 4751 The genome-wide cytosine methylation output file is tab-delimited in the following format: | |
| 4752 ========================================================================================== | |
| 4753 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | |
| 4754 | |
| 4755 | |
| 4756 | |
| 4757 This script was last modified on 25 November 2013. | |
| 4758 | |
| 4759 HOW_TO | |
| 4760 } |
