Mercurial > repos > bgruening > bismark
comparison bismark_methyl_extractor/bismark_methylation_extractor @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use warnings; | |
3 use strict; | |
4 $|++; | |
5 use Getopt::Long; | |
6 use Cwd; | |
7 use Carp; | |
8 use FindBin qw($Bin); | |
9 use lib "$Bin/../lib"; | |
10 | |
11 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) | |
12 | |
13 ## This program is free software: you can redistribute it and/or modify | |
14 ## it under the terms of the GNU General Public License as published by | |
15 ## the Free Software Foundation, either version 3 of the License, or | |
16 ## (at your option) any later version. | |
17 | |
18 ## This program is distributed in the hope that it will be useful, | |
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ## GNU General Public License for more details. | |
22 | |
23 ## You should have received a copy of the GNU General Public License | |
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
25 | |
26 my @filenames; # input files | |
27 my %counting; | |
28 my $parent_dir = getcwd(); | |
29 | |
30 my %fhs; | |
31 | |
32 my $version = 'v0.16.3'; | |
33 my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline(); | |
34 | |
35 | |
36 ### only needed for bedGraph output | |
37 my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files | |
38 my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total | |
39 my @bedfiles; | |
40 | |
41 ### only needed for genome-wide cytosine methylation report | |
42 my %chromosomes; | |
43 | |
44 my %mbias_1; | |
45 my %mbias_2; | |
46 | |
47 | |
48 ############################################################################################## | |
49 ### Summarising Run Parameters | |
50 ############################################################################################## | |
51 | |
52 ### METHYLATION EXTRACTOR | |
53 | |
54 warn "Summarising Bismark methylation extractor parameters:\n"; | |
55 warn '='x63,"\n"; | |
56 | |
57 if ($single){ | |
58 if ($vanilla){ | |
59 warn "Bismark single-end vanilla format specified\n"; | |
60 } | |
61 else{ | |
62 warn "Bismark single-end SAM format specified (default)\n"; # default | |
63 } | |
64 } | |
65 elsif ($paired){ | |
66 if ($vanilla){ | |
67 warn "Bismark paired-end vanilla format specified\n"; | |
68 } | |
69 else{ | |
70 warn "Bismark paired-end SAM format specified (default)\n"; # default | |
71 } | |
72 } | |
73 | |
74 warn "Number of cores to be used: $multicore\n"; | |
75 | |
76 if ($single){ | |
77 if ($ignore){ | |
78 warn "First $ignore bp will be disregarded when processing the methylation call string\n"; | |
79 } | |
80 if ($ignore_3prime){ | |
81 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n"; | |
82 } | |
83 | |
84 } | |
85 else{ ## paired-end | |
86 if ($ignore){ | |
87 warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; | |
88 } | |
89 if ($ignore_r2){ | |
90 warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; | |
91 } | |
92 | |
93 if ($ignore_3prime){ | |
94 warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n"; | |
95 } | |
96 if ($ignore_3prime_r2){ | |
97 warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; | |
98 } | |
99 | |
100 | |
101 } | |
102 | |
103 | |
104 if ($full){ | |
105 warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; | |
106 } | |
107 if ($merge_non_CpG){ | |
108 warn "Merge CHG and CHH context to non-CpG context specified\n"; | |
109 } | |
110 ### output directory | |
111 if ($output_dir eq ''){ | |
112 warn "Output will be written to the current directory ('$parent_dir')\n"; | |
113 } | |
114 else{ | |
115 warn "Output path specified as: $output_dir\n"; | |
116 } | |
117 | |
118 | |
119 sleep (1); | |
120 | |
121 ### BEDGRAPH | |
122 | |
123 if ($bedGraph){ | |
124 warn "\n\nSummarising bedGraph parameters:\n"; | |
125 warn '='x63,"\n"; | |
126 | |
127 if ($counts){ | |
128 warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n"; | |
129 } | |
130 else{ | |
131 warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n"; | |
132 } | |
133 | |
134 ### Zero-based coordinates | |
135 if ($zero){ | |
136 warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; | |
137 } | |
138 | |
139 warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; | |
140 | |
141 if ($CX_context){ | |
142 warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; | |
143 } | |
144 else{ # default | |
145 $CpG_only = 1; | |
146 warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; | |
147 } | |
148 | |
149 if ($remove){ | |
150 warn "White spaces in read ID names will be removed prior to sorting\n"; | |
151 } | |
152 | |
153 if ($ample_mem){ | |
154 warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; | |
155 } | |
156 elsif (defined $sort_size){ | |
157 warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; | |
158 } | |
159 else{ | |
160 warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; | |
161 } | |
162 | |
163 | |
164 | |
165 sleep (1); | |
166 | |
167 if ($cytosine_report){ | |
168 warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; | |
169 warn '='x63,"\n"; | |
170 warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n"; | |
171 | |
172 | |
173 if ($CX_context){ | |
174 warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; | |
175 } | |
176 else{ # default | |
177 $CpG_only = 1; | |
178 warn "Reporting cytosine methylation in CpG context only (default)\n"; | |
179 } | |
180 | |
181 if ($split_by_chromosome){ | |
182 warn "Splitting the cytosine report output up into individual files for each chromosome\n"; | |
183 } | |
184 | |
185 ### Zero-based coordinates | |
186 if ($zero){ | |
187 warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; | |
188 } | |
189 else{ # default, 1-based coords | |
190 warn "Using 1-based genomic coordinates (default)\n"; | |
191 } | |
192 | |
193 ### GENOME folder | |
194 if ($genome_folder){ | |
195 unless ($genome_folder =~/\/$/){ | |
196 $genome_folder =~ s/$/\//; | |
197 } | |
198 warn "Genome folder was specified as $genome_folder\n"; | |
199 } | |
200 else{ | |
201 $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; | |
202 warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; | |
203 } | |
204 sleep (1); | |
205 } | |
206 } | |
207 | |
208 warn "\n"; | |
209 sleep (1); | |
210 | |
211 ###################################################### | |
212 ### PROCESSING FILES | |
213 ###################################################### | |
214 | |
215 foreach my $filename (@filenames){ | |
216 # resetting counters and filehandles | |
217 %fhs = (); | |
218 %counting =( | |
219 total_meCHG_count => 0, | |
220 total_meCHH_count => 0, | |
221 total_meCpG_count => 0, | |
222 total_unmethylated_CHG_count => 0, | |
223 total_unmethylated_CHH_count => 0, | |
224 total_unmethylated_CpG_count => 0, | |
225 sequences_count => 0, | |
226 methylation_call_strings => 0, | |
227 ); | |
228 | |
229 @sorting_files = (); | |
230 @bedfiles = (); | |
231 | |
232 %mbias_1 = (); | |
233 %mbias_2 = (); | |
234 | |
235 | |
236 ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor | |
237 unless ($vanilla){ | |
238 if ($paired){ | |
239 test_positional_sorting($filename); | |
240 } | |
241 } | |
242 | |
243 my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename); | |
244 | |
245 if ($pid == 0){ | |
246 warn "Finished processing child process. Exiting..\n"; | |
247 | |
248 # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | |
249 # foreach my $fh (keys %fhs) { | |
250 # if ($fh =~ /^[1230]$/) { | |
251 # foreach my $context (keys %{$fhs{$fh}}) { | |
252 # $fhs{$fh}->{$context}->flush; | |
253 # } | |
254 # } | |
255 # else{ | |
256 # $fhs{$fh}->flush; | |
257 # } | |
258 # } | |
259 exit 0; | |
260 } | |
261 | |
262 ### | |
263 if ($pid and $multicore > 1){ | |
264 warn "Now waiting for all child processes to complete\n"; | |
265 sleep(1); | |
266 | |
267 ### we need to ensure that we wait for all child processes to be finished before continuing | |
268 # warn "here are the child IDs: @$pids\n"; | |
269 # warn "Looping through the child process IDs:\n"; | |
270 | |
271 foreach my $id (@$pids){ | |
272 # print "$id\t"; | |
273 my $kid = waitpid ($id,0); | |
274 # print "Returned: $kid\nExit status: $?\n"; | |
275 unless ($? == 0){ | |
276 warn "\nChild process terminated with exit signal: '$?'\n\n"; | |
277 } | |
278 } | |
279 } | |
280 | |
281 ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues | |
282 foreach my $fh (keys %fhs) { | |
283 if ($fh =~ /^[1230]$/) { | |
284 foreach my $context (keys %{$fhs{$fh}}) { | |
285 close $fhs{$fh}->{$context} or die $!; | |
286 } | |
287 } | |
288 else{ | |
289 close $fhs{$fh} or die $!; | |
290 } | |
291 } | |
292 | |
293 ### We need to stitch together a main splitting report from all individual parent/child processes | |
294 if ($multicore > 1){ | |
295 merge_individual_splitting_reports($report_basename); | |
296 print_splitting_report(); | |
297 | |
298 merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally | |
299 | |
300 } | |
301 | |
302 unless ($mbias_off){ | |
303 ### printing out all M-Bias data | |
304 produce_mbias_plots ($filename); produce_mbias_plots ($filename); | |
305 | |
306 } | |
307 | |
308 unless ($mbias_only){ | |
309 delete_unused_files(); | |
310 } | |
311 | |
312 if ($bedGraph){ | |
313 | |
314 my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
315 $out =~ s/gz$//; | |
316 $out =~ s/sam$//; | |
317 $out =~ s/bam$//; | |
318 $out =~ s/txt$//; | |
319 $out =~ s/$/bedGraph/; | |
320 | |
321 my $bedGraph_output = $out; | |
322 my @args; | |
323 | |
324 if ($remove){ | |
325 push @args, '--remove'; | |
326 } | |
327 if ($CX_context){ | |
328 push @args, '--CX_context'; | |
329 } | |
330 if ($no_header){ | |
331 push @args, '--no_header'; | |
332 } | |
333 if ($gazillion){ | |
334 push @args, '--gazillion'; | |
335 } | |
336 if ($ample_mem){ | |
337 push @args, '--ample_memory'; | |
338 } | |
339 if ($zero){ | |
340 push @args, "--zero"; | |
341 } | |
342 | |
343 # if ($counts){ | |
344 # push @args, "--counts"; | |
345 # } | |
346 | |
347 push @args, "--buffer_size $sort_size"; | |
348 push @args, "--cutoff $coverage_threshold"; | |
349 push @args, "--output $bedGraph_output"; | |
350 push @args, "--dir '$output_dir'"; | |
351 | |
352 ### adding all files to be sorted to @args | |
353 foreach my $f (@sorting_files){ | |
354 push @args, $f; | |
355 } | |
356 | |
357 # print join "\t",@args,"\n"; | |
358 | |
359 system ("$Bin/bismark2bedGraph @args"); | |
360 | |
361 warn "Finished BedGraph conversion ...\n\n"; | |
362 sleep(1); | |
363 | |
364 # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; | |
365 # warn "Writing bedGraph to file: $bedGraph_output\n"; | |
366 # process_bedGraph_output(); | |
367 # close OUT or die $!; | |
368 | |
369 ### genome-wide cytosine methylation report requires bedGraph processing anyway | |
370 if ($cytosine_report){ | |
371 | |
372 @args = (); # resetting @args | |
373 my $cytosine_out = $out; | |
374 $cytosine_out =~ s/bedGraph$//; | |
375 | |
376 if ($CX_context){ | |
377 $cytosine_out =~ s/$/CX_report.txt/; | |
378 } | |
379 else{ | |
380 $cytosine_out =~ s/$/CpG_report.txt/; | |
381 } | |
382 | |
383 push @args, "--output $cytosine_out"; | |
384 push @args, "--dir '$output_dir'"; | |
385 push @args, "--genome '$genome_folder'"; | |
386 push @args, "--parent_dir '$parent_dir'"; | |
387 | |
388 if ($zero){ | |
389 push @args, "--zero"; | |
390 } | |
391 if ($CX_context){ | |
392 push @args, '--CX_context'; | |
393 } | |
394 if ($split_by_chromosome){ | |
395 push @args, '--split_by_chromosome'; | |
396 } | |
397 if ($gzip){ | |
398 push @args, '--gzip'; | |
399 } | |
400 | |
401 my $coverage_output = $bedGraph_output; | |
402 $coverage_output =~ s/bedGraph$/bismark.cov.gz/; | |
403 | |
404 push @args, $coverage_output; # this will be the infile | |
405 # warn "Handing over the following infile: $coverage_output\n"; sleep (5); | |
406 system ("$Bin/coverage2cytosine @args"); | |
407 | |
408 warn "\n\nFinished generating genome-wide cytosine report\n\n"; | |
409 } | |
410 } | |
411 } | |
412 | |
413 sub merge_individual_splitting_reports{ | |
414 | |
415 my $report_basename = shift; | |
416 | |
417 my @splitting_reports; # only needed in multi-core mode to generate an overall report | |
418 foreach my $ext (1..$multicore){ | |
419 push @splitting_reports, "$report_basename.$ext"; | |
420 } | |
421 warn "\nMerging individual splitting reports into overall report: '$report_basename'\n"; | |
422 warn "Merging from these individual files:\n"; | |
423 print join ("\n",@splitting_reports),"\n\n"; | |
424 sleep(1); | |
425 | |
426 ########## | |
427 # resetting the counter first | |
428 %counting =( | |
429 total_meCHG_count => 0, | |
430 total_meCHH_count => 0, | |
431 total_meCpG_count => 0, | |
432 total_unmethylated_CHG_count => 0, | |
433 total_unmethylated_CHH_count => 0, | |
434 total_unmethylated_CpG_count => 0, | |
435 sequences_count => 0, | |
436 methylation_call_strings => 0, | |
437 ); | |
438 | |
439 # repopulating the merged counter | |
440 foreach my $file (@splitting_reports){ | |
441 open (IR,$file) or die $!; | |
442 while (<IR>){ | |
443 chomp; | |
444 my ($context,$count) = (split /\t/); | |
445 if ($context){ | |
446 if ($context =~ /^Total C to T conversions in CpG context/){ | |
447 # warn "context: $context\ncount: $count\n"; | |
448 $counting{total_unmethylated_CpG_count} += $count; | |
449 } | |
450 elsif ($context =~ /^Total C to T conversions in CHG context/){ | |
451 # warn "context: $context\ncount: $count\n"; | |
452 $counting{total_unmethylated_CHG_count} += $count; | |
453 } | |
454 elsif ($context =~ /^Total C to T conversions in CHH context/){ | |
455 # warn "context: $context\ncount: $count\n"; | |
456 $counting{total_unmethylated_CHH_count} += $count; | |
457 } | |
458 elsif ($context =~ /^Total methylated C\'s in CpG context/){ | |
459 # warn "context: $context\ncount: $count\n"; | |
460 $counting{total_meCpG_count} += $count; | |
461 } | |
462 elsif ($context =~ /^Total methylated C\'s in CHG context/){ | |
463 # warn "context: $context\ncount: $count\n"; | |
464 $counting{total_meCHG_count} += $count; | |
465 } | |
466 elsif ($context =~ /^Total methylated C\'s in CHH context/){ | |
467 # warn "context: $context\ncount: $count\n"; | |
468 $counting{total_meCHH_count} += $count; | |
469 } | |
470 elsif ($context =~ /^line count/){ | |
471 # warn "Line count\ncount: $count\n"; | |
472 $counting{sequences_count} = $count; # always the same | |
473 } | |
474 elsif ($context =~ /^meth call strings/){ | |
475 # warn "Meth call strings\ncount: $count\n"; | |
476 $counting{methylation_call_strings} += $count; | |
477 } | |
478 } | |
479 } | |
480 } | |
481 | |
482 # deleting the individual reports afterwards | |
483 foreach my $file (@splitting_reports){ | |
484 unlink $file; | |
485 } | |
486 } | |
487 | |
488 | |
489 sub merge_individual_mbias_reports{ | |
490 | |
491 my $report_basename = shift; | |
492 | |
493 my @mbias_reports; # only needed in multi-core mode to generate an overall report | |
494 foreach my $ext (1..$multicore){ | |
495 push @mbias_reports, "$report_basename.${ext}.mbias"; | |
496 } | |
497 warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n"; | |
498 print join ("\n",@mbias_reports),"\n\n"; | |
499 | |
500 | |
501 ########## | |
502 # resetting the counters first, then repopulating them | |
503 %mbias_1 = (); | |
504 %mbias_2 = (); | |
505 | |
506 # repopulating the merged counter | |
507 foreach my $file (@mbias_reports){ | |
508 open (IR,$file) or die $!; | |
509 | |
510 my $context; | |
511 my $read; | |
512 | |
513 while (<IR>){ | |
514 chomp; | |
515 # warn "$_\n"; sleep(1); | |
516 if ($_ =~ /context/){ | |
517 $context = $1 if ($_ =~ /(\D{3}) context/); | |
518 # warn "Context set as $context\n"; | |
519 | |
520 if ($_ =~ /R2/){ | |
521 $read = 'R2'; | |
522 } | |
523 else{ | |
524 $read = 'R1'; | |
525 } | |
526 # warn "Setting read identity to '$read'\n"; | |
527 | |
528 # reading in 2 additional lines (===========, and header line) | |
529 $_ = <IR>; | |
530 #warn "discarding line $_\n"; | |
531 $_ = <IR>; | |
532 #warn "discarding line $_\n"; | |
533 next; | |
534 } | |
535 else{ | |
536 | |
537 if ($_ eq ''){ | |
538 # empty line, only occurs after a context has finished and before a new context starts | |
539 next; | |
540 } | |
541 | |
542 my ($pos,$meth,$unmeth) = (split /\t/); | |
543 # warn "$pos\t$meth\t$unmeth\n"; sleep(1); | |
544 if ($read eq 'R1'){ | |
545 $mbias_1{$context}->{$pos}->{meth} += $meth; | |
546 $mbias_1{$context}->{$pos}->{un} += $unmeth; | |
547 } | |
548 elsif ($read eq 'R2'){ | |
549 $mbias_2{$context}->{$pos}->{meth} += $meth; | |
550 $mbias_2{$context}->{$pos}->{un} += $unmeth; | |
551 } | |
552 } | |
553 } | |
554 close IR or warn "Had trouble closing filehandle for $file: $!\n"; | |
555 } | |
556 | |
557 # deleting the individual reports afterwards | |
558 foreach my $file (@mbias_reports){ | |
559 unlink $file; | |
560 } | |
561 } | |
562 | |
563 | |
564 sub delete_unused_files{ | |
565 | |
566 warn "Deleting unused files ...\n\n"; sleep(1); | |
567 | |
568 my $index = 0; | |
569 | |
570 while ($index <= $#sorting_files){ | |
571 if ($sorting_files[$index] =~ /gz$/){ | |
572 open (USED,"gunzip -c $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
573 } | |
574 else{ | |
575 open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; | |
576 } | |
577 | |
578 my $used = 0; | |
579 | |
580 while (<USED>){ | |
581 next if (/^Bismark/); | |
582 if ($_){ | |
583 $used = 1; | |
584 last; | |
585 } | |
586 } | |
587 | |
588 if ($used){ | |
589 warn "$sorting_files[$index] contains data ->\tkept\n"; | |
590 ++$index; | |
591 } | |
592 else{ | |
593 | |
594 my $delete = unlink $sorting_files[$index]; | |
595 | |
596 if ($delete){ | |
597 warn "$sorting_files[$index] was empty ->\tdeleted\n"; | |
598 } | |
599 else{ | |
600 warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" | |
601 } | |
602 | |
603 ### we also need to remove the element from @sorting_files | |
604 splice @sorting_files, $index, 1; | |
605 } | |
606 } | |
607 warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) | |
608 } | |
609 | |
610 sub produce_mbias_plots{ | |
611 | |
612 my $filename = shift; | |
613 | |
614 my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
615 $mbias =~ s/gz$//; | |
616 $mbias =~ s/sam$//; | |
617 $mbias =~ s/bam$//; | |
618 $mbias =~ s/cram$//; | |
619 $mbias =~ s/txt$//; | |
620 my $mbias_graph_1 = my $mbias_graph_2 = $mbias; | |
621 $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; | |
622 $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; | |
623 | |
624 $mbias =~ s/$/M-bias.txt/; | |
625 | |
626 open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; | |
627 | |
628 # determining maximum read length | |
629 my $max_length_1 = 0; | |
630 my $max_length_2 = 0; | |
631 | |
632 foreach my $context (keys %mbias_1){ | |
633 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ | |
634 $max_length_1 = $pos unless ($max_length_1 >= $pos); | |
635 } | |
636 } | |
637 if ($paired){ | |
638 foreach my $context (keys %mbias_2){ | |
639 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ | |
640 $max_length_2 = $pos unless ($max_length_2 >= $pos); | |
641 } | |
642 } | |
643 } | |
644 | |
645 if ($single){ | |
646 warn "Determining maximum read length for M-Bias plot\n"; | |
647 warn "Maximum read length of Read 1: $max_length_1\n\n"; | |
648 } | |
649 else{ | |
650 warn "Determining maximum read lengths for M-Bias plots\n"; | |
651 warn "Maximum read length of Read 1: $max_length_1\n"; | |
652 warn "Maximum read length of Read 2: $max_length_2\n\n"; | |
653 } | |
654 # sleep(3); | |
655 | |
656 my @mbias_read1; | |
657 my @mbias_read2; | |
658 | |
659 #Check whether the module GD::Graph:lines is installed | |
660 my $gd_graph_installed = 0; | |
661 eval{ | |
662 require GD::Graph::lines; | |
663 GD::Graph::lines->import(); | |
664 }; | |
665 | |
666 unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...} | |
667 $gd_graph_installed = 1; | |
668 | |
669 #Check whether the module GD::Graph::colour is installed | |
670 eval{ | |
671 require GD::Graph::colour; | |
672 GD::Graph::colour->import(qw(:colours :lists :files :convert)); | |
673 }; | |
674 | |
675 if ($@) { | |
676 warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
677 sleep(2); | |
678 $gd_graph_installed = 0; | |
679 } | |
680 | |
681 | |
682 } | |
683 else{ | |
684 warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; | |
685 sleep(2); | |
686 } | |
687 | |
688 | |
689 my $graph_title; | |
690 my $graph1; | |
691 my $graph2; | |
692 | |
693 if ( $gd_graph_installed){ | |
694 $graph1 = GD::Graph::lines->new(800,600); | |
695 if ($paired){ | |
696 $graph2 = GD::Graph::lines->new(800,600); | |
697 } | |
698 } | |
699 | |
700 foreach my $context (qw(CpG CHG CHH)){ | |
701 @{$mbias_read1[0]} = (); | |
702 | |
703 if ($paired){ | |
704 print MBIAS "$context context (R1)\n================\n"; | |
705 $graph_title = 'M-bias (Read 1)'; | |
706 } | |
707 else{ | |
708 print MBIAS "$context context\n===========\n"; | |
709 $graph_title = 'M-bias'; | |
710 } | |
711 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
712 | |
713 foreach my $pos (1..$max_length_1){ | |
714 | |
715 unless (defined $mbias_1{$context}->{$pos}->{meth}){ | |
716 $mbias_1{$context}->{$pos}->{meth} = 0; | |
717 } | |
718 unless (defined $mbias_1{$context}->{$pos}->{un}){ | |
719 $mbias_1{$context}->{$pos}->{un} = 0; | |
720 } | |
721 | |
722 my $percent = ''; | |
723 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ | |
724 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); | |
725 } | |
726 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; | |
727 | |
728 print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
729 push @{$mbias_read1[0]},$pos; | |
730 | |
731 if ($context eq 'CpG'){ | |
732 push @{$mbias_read1[1]},$percent; | |
733 push @{$mbias_read1[4]},$coverage; | |
734 } | |
735 elsif ($context eq 'CHG'){ | |
736 push @{$mbias_read1[2]},$percent; | |
737 push @{$mbias_read1[5]},$coverage; | |
738 } | |
739 elsif ($context eq 'CHH'){ | |
740 push @{$mbias_read1[3]},$percent; | |
741 push @{$mbias_read1[6]},$coverage; | |
742 } | |
743 } | |
744 print MBIAS "\n"; | |
745 } | |
746 | |
747 if ( $gd_graph_installed){ | |
748 | |
749 add_colour(nice_blue => [31,120,180]); | |
750 add_colour(nice_orange => [255,127,0]); | |
751 add_colour(nice_green => [51,160,44]); | |
752 add_colour(pale_blue => [153,206,227]); | |
753 add_colour(pale_orange => [253,204,138]); | |
754 add_colour(pale_green => [191,230,207]); | |
755 | |
756 $graph1->set( | |
757 x_label => 'position (bp)', | |
758 y1_label => '% methylation', | |
759 y2_label => '# methylation calls', | |
760 title => $graph_title, | |
761 line_width => 2, | |
762 x_max_value => $max_length_1, | |
763 x_min_value => 0, | |
764 y_tick_number => 10, | |
765 y_label_skip => 2, | |
766 y1_max_value => 100, | |
767 y1_min_value => 0, | |
768 y_label_skip => 2, | |
769 y2_min_value => 0, | |
770 x_label_skip => 5, | |
771 x_label_position => 0.5, | |
772 x_tick_offset => -1, | |
773 bgclr => 'white', | |
774 transparent => 0, | |
775 two_axes => 1, | |
776 use_axis => [1,1,1,2,2,2], | |
777 legend_placement => 'RC', | |
778 legend_spacing => 6, | |
779 legend_marker_width => 24, | |
780 legend_marker_height => 18, | |
781 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
782 ) or die $graph1->error; | |
783 | |
784 $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
785 | |
786 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) | |
787 if (my $gd1 = $graph1->plot(\@mbias_read1)) { | |
788 open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; | |
789 binmode MBIAS_G1; | |
790 print MBIAS_G1 $gd1->png; | |
791 } | |
792 else { | |
793 warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n"; | |
794 } | |
795 | |
796 # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; | |
797 # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; | |
798 # binmode MBIAS_G1; | |
799 # print MBIAS_G1 $gd1->png; | |
800 } | |
801 | |
802 if ($paired){ | |
803 | |
804 foreach my $context (qw(CpG CHG CHH)){ | |
805 @{$mbias_read2[0]} = (); | |
806 | |
807 print MBIAS "$context context (R2)\n================\n"; | |
808 print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
809 foreach my $pos (1..$max_length_2){ | |
810 | |
811 unless (defined $mbias_2{$context}->{$pos}->{meth}){ | |
812 $mbias_2{$context}->{$pos}->{meth} = 0; | |
813 } | |
814 unless (defined $mbias_2{$context}->{$pos}->{un}){ | |
815 $mbias_2{$context}->{$pos}->{un} = 0; | |
816 } | |
817 | |
818 my $percent = ''; | |
819 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ | |
820 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); | |
821 } | |
822 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; | |
823 | |
824 print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
825 | |
826 push @{$mbias_read2[0]},$pos; | |
827 | |
828 if ($context eq 'CpG'){ | |
829 push @{$mbias_read2[1]},$percent; | |
830 push @{$mbias_read2[4]},$coverage; | |
831 } | |
832 elsif ($context eq 'CHG'){ | |
833 push @{$mbias_read2[2]},$percent; | |
834 push @{$mbias_read2[5]},$coverage; | |
835 } | |
836 elsif ($context eq 'CHH'){ | |
837 push @{$mbias_read2[3]},$percent; | |
838 push @{$mbias_read2[6]},$coverage; | |
839 } | |
840 } | |
841 print MBIAS "\n"; | |
842 } | |
843 | |
844 if ( $gd_graph_installed){ | |
845 | |
846 add_colour(nice_blue => [31,120,180]); | |
847 add_colour(nice_orange => [255,127,0]); | |
848 add_colour(nice_green => [51,160,44]); | |
849 add_colour(pale_blue => [153,206,227]); | |
850 add_colour(pale_orange => [253,204,138]); | |
851 add_colour(pale_green => [191,230,207]); | |
852 | |
853 $graph2->set( | |
854 x_label => 'position (bp)', | |
855 line_width => 2, | |
856 x_max_value => $max_length_1, | |
857 x_min_value => 0, | |
858 y_tick_number => 10, | |
859 y_label_skip => 2, | |
860 y1_max_value => 100, | |
861 y1_min_value => 0, | |
862 y_label_skip => 2, | |
863 y2_min_value => 0, | |
864 x_label_skip => 5, | |
865 x_label_position => 0.5, | |
866 x_tick_offset => -1, | |
867 bgclr => 'white', | |
868 transparent => 0, | |
869 two_axes => 1, | |
870 use_axis => [1,1,1,2,2,2], | |
871 legend_placement => 'RC', | |
872 legend_spacing => 6, | |
873 legend_marker_width => 24, | |
874 legend_marker_height => 18, | |
875 dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], | |
876 x_label => 'position (bp)', | |
877 y1_label => '% methylation', | |
878 y2_label => '# calls', | |
879 title => 'M-bias (Read 2)', | |
880 ) or die $graph2->error; | |
881 | |
882 $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); | |
883 | |
884 ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) | |
885 if (my $gd2 = $graph2->plot(\@mbias_read2)) { | |
886 open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; | |
887 binmode MBIAS_G2; | |
888 print MBIAS_G2 $gd2->png; | |
889 } | |
890 else { | |
891 warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n"; | |
892 } | |
893 | |
894 # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; | |
895 # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; | |
896 # binmode MBIAS_G2; | |
897 # print MBIAS_G2 $gd2->png; | |
898 | |
899 } | |
900 } | |
901 } | |
902 | |
903 sub process_commandline{ | |
904 my $help; | |
905 my $single_end; | |
906 my $paired_end; | |
907 my $ignore; | |
908 my $ignore_r2; | |
909 my $genomic_fasta; | |
910 my $full; | |
911 my $report; | |
912 my $extractor_version; | |
913 my $no_overlap; | |
914 my $merge_non_CpG; | |
915 my $vanilla; | |
916 my $output_dir; | |
917 my $no_header; | |
918 my $bedGraph; | |
919 my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status | |
920 my $remove; | |
921 my $counts; | |
922 my $cytosine_report; | |
923 my $genome_folder; | |
924 my $zero; | |
925 my $CpG_only; | |
926 my $CX_context; | |
927 my $split_by_chromosome; | |
928 my $sort_size; | |
929 my $samtools_path; | |
930 my $gzip; | |
931 my $mbias_only; | |
932 my $mbias_off; | |
933 my $gazillion; | |
934 my $ample_mem; | |
935 my $include_overlap; | |
936 my $ignore_3prime; | |
937 my $ignore_3prime_r2; | |
938 my $multicore; | |
939 | |
940 my $command_line = GetOptions ('help|man' => \$help, | |
941 'p|paired-end' => \$paired_end, | |
942 's|single-end' => \$single_end, | |
943 'fasta' => \$genomic_fasta, | |
944 'ignore=i' => \$ignore, | |
945 'ignore_r2=i' => \$ignore_r2, | |
946 'comprehensive' => \$full, | |
947 'report' => \$report, | |
948 'version' => \$extractor_version, | |
949 'no_overlap' => \$no_overlap, | |
950 'merge_non_CpG' => \$merge_non_CpG, | |
951 'vanilla' => \$vanilla, | |
952 'o|output=s' => \$output_dir, | |
953 'no_header' => \$no_header, | |
954 'bedGraph' => \$bedGraph, | |
955 "cutoff=i" => \$coverage_threshold, | |
956 "remove_spaces" => \$remove, | |
957 "counts" => \$counts, | |
958 "cytosine_report" => \$cytosine_report, | |
959 'g|genome_folder=s' => \$genome_folder, | |
960 "zero_based" => \$zero, | |
961 "CX|CX_context" => \$CX_context, | |
962 "split_by_chromosome" => \$split_by_chromosome, | |
963 "buffer_size=s" => \$sort_size, | |
964 'samtools_path=s' => \$samtools_path, | |
965 'gzip' => \$gzip, | |
966 'mbias_only' => \$mbias_only, | |
967 'mbias_off' => \$mbias_off, | |
968 'gazillion|scaffolds' => \$gazillion, | |
969 'ample_memory' => \$ample_mem, | |
970 'include_overlap' => \$include_overlap, | |
971 'ignore_3prime=i' => \$ignore_3prime, | |
972 'ignore_3prime_r2=i' => \$ignore_3prime_r2, | |
973 'multicore=i' => \$multicore, | |
974 ); | |
975 | |
976 ### EXIT ON ERROR if there were errors with any of the supplied options | |
977 unless ($command_line){ | |
978 die "Please respecify command line options\n"; | |
979 } | |
980 | |
981 ### HELPFILE | |
982 if ($help){ | |
983 print_helpfile(); | |
984 exit; | |
985 } | |
986 | |
987 if ($extractor_version){ | |
988 print << "VERSION"; | |
989 | |
990 | |
991 Bismark Methylation Extractor | |
992 | |
993 Bismark Extractor Version: $version | |
994 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics | |
995 www.bioinformatics.babraham.ac.uk/projects/bismark/ | |
996 | |
997 | |
998 VERSION | |
999 exit; | |
1000 } | |
1001 | |
1002 | |
1003 ### no files provided | |
1004 unless (@ARGV){ | |
1005 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
1006 } | |
1007 @filenames = @ARGV; | |
1008 | |
1009 warn "\n *** Bismark methylation extractor version $version ***\n\n"; | |
1010 | |
1011 ### M-BIAS ONLY | |
1012 if ($mbias_only){ | |
1013 | |
1014 if ($mbias_off){ | |
1015 die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n"; | |
1016 } | |
1017 if ($bedGraph){ | |
1018 die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; | |
1019 } | |
1020 if ($cytosine_report){ | |
1021 die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; | |
1022 } | |
1023 if ($merge_non_CpG){ | |
1024 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; | |
1025 } | |
1026 if ($full){ | |
1027 warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; | |
1028 } | |
1029 sleep(3); | |
1030 } | |
1031 | |
1032 ### PRINT A REPORT | |
1033 unless ($report){ | |
1034 $report = 1; # making this the default | |
1035 } | |
1036 | |
1037 ### OUTPUT DIR PATH | |
1038 if ($output_dir){ | |
1039 unless ($output_dir =~ /\/$/){ | |
1040 $output_dir =~ s/$/\//; | |
1041 } | |
1042 } | |
1043 else{ | |
1044 $output_dir = ''; | |
1045 } | |
1046 | |
1047 ### NO HEADER | |
1048 unless ($no_header){ | |
1049 $no_header = 0; | |
1050 } | |
1051 | |
1052 ### OLD (VANILLA) OUTPUT FORMAT | |
1053 unless ($vanilla){ | |
1054 $vanilla = 0; | |
1055 } | |
1056 | |
1057 if ($single_end){ | |
1058 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
1059 } | |
1060 elsif ($paired_end){ | |
1061 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
1062 } | |
1063 else{ | |
1064 | |
1065 ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header | |
1066 | |
1067 if ($vanilla){ | |
1068 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; | |
1069 } | |
1070 else{ # SAM/BAM format | |
1071 | |
1072 my $file = $filenames[0]; | |
1073 warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); | |
1074 | |
1075 ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file | |
1076 if ($file =~ /\.gz$/){ | |
1077 open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; | |
1078 } | |
1079 elsif ($file =~ /\.cram$/ || $file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam | |
1080 open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM/CRAM file $file: $!\n"; | |
1081 } | |
1082 else{ | |
1083 open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; | |
1084 } | |
1085 | |
1086 while (<DETERMINE>){ | |
1087 last unless (/^\@/); | |
1088 if ($_ =~ /^\@PG/){ | |
1089 # warn "found the \@PG line:\n"; | |
1090 # warn "$_"; | |
1091 | |
1092 if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ | |
1093 warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
1094 $paired_end = 1; | |
1095 $single_end = 0; | |
1096 } | |
1097 else{ | |
1098 warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); | |
1099 $paired_end = 0; | |
1100 $single_end = 1; | |
1101 } | |
1102 } | |
1103 } | |
1104 | |
1105 # close DETERMINE or warn $!; # this always throws an error anyway... | |
1106 | |
1107 } | |
1108 } | |
1109 | |
1110 ### IGNORING <INT> 5' END | |
1111 # bases at the start of the read when processing the methylation call string | |
1112 unless ($ignore){ | |
1113 $ignore = 0; | |
1114 } | |
1115 | |
1116 if (defined $ignore_r2){ | |
1117 die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); | |
1118 } | |
1119 else{ | |
1120 $ignore_r2 = 0; | |
1121 } | |
1122 | |
1123 ### IGNORING <INT> 3' END | |
1124 # bases at the end of the read when processing the methylation call string | |
1125 unless ($ignore_3prime){ | |
1126 $ignore_3prime = 0; | |
1127 } | |
1128 | |
1129 if (defined $ignore_3prime_r2){ | |
1130 die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end); | |
1131 } | |
1132 else{ | |
1133 $ignore_3prime_r2 = 0; | |
1134 } | |
1135 | |
1136 | |
1137 ### NO OVERLAP | |
1138 ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps | |
1139 if ($include_overlap){ | |
1140 die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end); | |
1141 warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n"; | |
1142 $no_overlap = 0; | |
1143 } | |
1144 else{ # default | |
1145 if ($paired_end){ | |
1146 warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n"; | |
1147 $no_overlap = 1; | |
1148 } | |
1149 } | |
1150 | |
1151 ### COMPREHENSIVE OUTPUT | |
1152 unless ($full){ | |
1153 $full = 0; | |
1154 } | |
1155 | |
1156 ### MERGE NON-CpG context | |
1157 unless ($merge_non_CpG){ | |
1158 $merge_non_CpG = 0; | |
1159 } | |
1160 | |
1161 ### remove white spaces in read ID (needed for sorting using the sort command | |
1162 unless ($remove){ | |
1163 $remove = 0; | |
1164 } | |
1165 | |
1166 ### COVERAGE THRESHOLD FOR bedGraph OUTPUT | |
1167 if (defined $coverage_threshold){ | |
1168 unless ($coverage_threshold > 0){ | |
1169 die "Please select a coverage greater than 0 (positive integers only)\n"; | |
1170 } | |
1171 } | |
1172 else{ | |
1173 $coverage_threshold = 1; | |
1174 } | |
1175 | |
1176 ### SORT buffer size | |
1177 if (defined $sort_size){ | |
1178 unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ | |
1179 die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; | |
1180 } | |
1181 } | |
1182 else{ | |
1183 $sort_size = '2G'; | |
1184 } | |
1185 | |
1186 if ($zero){ | |
1187 die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); | |
1188 } | |
1189 | |
1190 if ($CX_context){ | |
1191 die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); | |
1192 } | |
1193 else{ | |
1194 $CX_context = 0; | |
1195 } | |
1196 | |
1197 unless ($counts){ | |
1198 $counts = 1; # counts will always be set | |
1199 } | |
1200 | |
1201 if ($cytosine_report){ | |
1202 | |
1203 ### GENOME folder | |
1204 if ($genome_folder){ | |
1205 unless ($genome_folder =~/\/$/){ | |
1206 $genome_folder =~ s/$/\//; | |
1207 } | |
1208 } | |
1209 else{ | |
1210 die "Please specify a genome folder to proceed (full path only)\n"; | |
1211 } | |
1212 | |
1213 unless ($bedGraph){ | |
1214 warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; | |
1215 $bedGraph = 1; | |
1216 } | |
1217 unless ($counts){ | |
1218 # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; | |
1219 $counts = 1; | |
1220 } | |
1221 warn "\n"; | |
1222 } | |
1223 | |
1224 ### PATH TO SAMTOOLS | |
1225 if (defined $samtools_path){ | |
1226 # if Samtools was specified as full command | |
1227 if ($samtools_path =~ /samtools$/){ | |
1228 if (-e $samtools_path){ | |
1229 # Samtools executable found | |
1230 } | |
1231 else{ | |
1232 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
1233 } | |
1234 } | |
1235 else{ | |
1236 unless ($samtools_path =~ /\/$/){ | |
1237 $samtools_path =~ s/$/\//; | |
1238 } | |
1239 $samtools_path .= 'samtools'; | |
1240 if (-e $samtools_path){ | |
1241 # Samtools executable found | |
1242 } | |
1243 else{ | |
1244 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
1245 } | |
1246 } | |
1247 } | |
1248 # Check whether Samtools is in the PATH if no path was supplied by the user | |
1249 else{ | |
1250 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH | |
1251 $samtools_path = `which samtools`; | |
1252 chomp $samtools_path; | |
1253 } | |
1254 } | |
1255 | |
1256 unless (defined $samtools_path){ | |
1257 $samtools_path = ''; | |
1258 } | |
1259 | |
1260 | |
1261 if ($gazillion){ | |
1262 if ($ample_mem){ | |
1263 die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; | |
1264 } | |
1265 } | |
1266 | |
1267 if (defined $multicore){ | |
1268 unless ($multicore > 0){ | |
1269 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; | |
1270 } | |
1271 if ($multicore > 20){ | |
1272 warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n"; | |
1273 } | |
1274 } | |
1275 else{ | |
1276 $multicore = 1; # default. Single-thread mode | |
1277 warn "Setting core usage to single-threaded (default). Consider using --multicore <int> to speed up the extraction process.\n\n"; | |
1278 } | |
1279 | |
1280 return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore); | |
1281 } | |
1282 | |
1283 | |
1284 sub test_positional_sorting{ | |
1285 | |
1286 my $filename = shift; | |
1287 | |
1288 print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; | |
1289 sleep(1); | |
1290 | |
1291 if ($filename =~ /\.gz$/) { | |
1292 open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
1293 } | |
1294 elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam | |
1295 if ($samtools_path){ | |
1296 open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n"; | |
1297 } | |
1298 else{ | |
1299 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
1300 } | |
1301 } | |
1302 else { | |
1303 open (TEST,$filename) or die "Can't open file $filename: $!\n"; | |
1304 } | |
1305 | |
1306 my $count = 0; | |
1307 | |
1308 while (<TEST>) { | |
1309 if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) | |
1310 if (/^\@SO/) { | |
1311 die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; | |
1312 } | |
1313 next; | |
1314 } | |
1315 $count++; | |
1316 | |
1317 last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID | |
1318 | |
1319 my ($id_1) = (split (/\t/)); | |
1320 | |
1321 ### reading the next line which should be read 2 | |
1322 $_ = <TEST>; | |
1323 my ($id_2) = (split (/\t/)); | |
1324 last unless ($id_2); | |
1325 ++$count; | |
1326 | |
1327 if ($id_1 eq $id_2){ | |
1328 ### ids are the same | |
1329 next; | |
1330 } | |
1331 else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first | |
1332 my $id_1_trunc = $id_1; | |
1333 $id_1_trunc =~ s/\/1$//; | |
1334 my $id_2_trunc = $id_2; | |
1335 $id_2_trunc =~ s/\/2$//; | |
1336 | |
1337 unless ($id_1_trunc eq $id_2_trunc){ | |
1338 die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; | |
1339 } | |
1340 } | |
1341 } | |
1342 # close TEST or die $!; somehow fails on our cluster... | |
1343 ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) | |
1344 warn "...passed!\n"; | |
1345 sleep(1); | |
1346 | |
1347 } | |
1348 | |
1349 sub process_Bismark_results_file{ | |
1350 | |
1351 my $filename = shift; | |
1352 my $report_filename = open_output_filehandles($filename); | |
1353 | |
1354 ### disabling buffering so we don't run into problems with half written out lines... | |
1355 foreach my $fh (keys %fhs){ | |
1356 if ($fh =~ /^[1230]$/) { | |
1357 foreach my $context (keys %{$fhs{$fh}}) { | |
1358 select($fhs{$fh}->{$context}); | |
1359 $|++; | |
1360 } | |
1361 } | |
1362 else{ | |
1363 select($fhs{$fh}); | |
1364 $|++; | |
1365 } | |
1366 } | |
1367 select(STDOUT); | |
1368 | |
1369 ################################################ | |
1370 ################################################ | |
1371 ### multi-process handling | |
1372 ### | |
1373 | |
1374 my $offset = 1; | |
1375 my $process_id; | |
1376 my @pids; | |
1377 if ($multicore > 1){ | |
1378 | |
1379 until ($offset == $multicore){ | |
1380 # warn "multicore: $multicore\noffset: $offset\n"; | |
1381 my $fork = fork; | |
1382 | |
1383 if (defined $fork){ | |
1384 if ($fork != 0){ | |
1385 $process_id = $fork; | |
1386 push @pids, $process_id; | |
1387 if ($offset < $multicore){ | |
1388 ++$offset; | |
1389 # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; | |
1390 } | |
1391 else{ | |
1392 # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; | |
1393 } | |
1394 } | |
1395 elsif ($fork == 0){ | |
1396 # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; | |
1397 $process_id = $fork; | |
1398 last; | |
1399 } | |
1400 } | |
1401 else{ | |
1402 die "Forking unsuccessful. Proceeding using a single thread only\n"; | |
1403 } | |
1404 } | |
1405 | |
1406 # warn "\nThe thread identity\n===================\n"; | |
1407 if ($process_id){ | |
1408 # print "I am the parent process. My children are called:\n"; | |
1409 # print join ("\t",@pids),"\n"; | |
1410 # print "I am going to process the following line count: $offset\n\n"; | |
1411 } | |
1412 elsif($process_id == 0){ | |
1413 # warn "I am a child process: Process ID: $process_id\n"; | |
1414 # warn "I am going to process the following line count: $offset\n\n"; | |
1415 } | |
1416 else{ | |
1417 die "Process ID was: $process_id\n"; | |
1418 } | |
1419 } | |
1420 else{ | |
1421 # warn "Single-core mode: setting pid to 1\n"; | |
1422 $process_id = 1; | |
1423 } | |
1424 | |
1425 ################################################ | |
1426 ################################################ | |
1427 if ($process_id){ | |
1428 warn "Now reading in Bismark result file $filename\n"; | |
1429 } | |
1430 else{ | |
1431 warn "\nNow reading in Bismark result file $filename\n\n"; | |
1432 } | |
1433 | |
1434 if ($filename =~ /\.gz$/) { | |
1435 open (IN,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; | |
1436 } | |
1437 elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam | |
1438 if ($samtools_path){ | |
1439 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n"; | |
1440 } | |
1441 else{ | |
1442 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; | |
1443 } | |
1444 } | |
1445 else { | |
1446 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
1447 } | |
1448 | |
1449 ### Vanilla and SAM output need to read different numbers of header lines | |
1450 if ($vanilla) { | |
1451 my $bismark_version = <IN>; ## discarding the Bismark version info | |
1452 chomp $bismark_version; | |
1453 $bismark_version =~ s/\r//; # replaces \r line feed | |
1454 $bismark_version =~ s/Bismark version: //; | |
1455 if ($bismark_version =~ /^\@/) { | |
1456 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; | |
1457 sleep (1); | |
1458 } | |
1459 | |
1460 unless ($version eq $bismark_version){ | |
1461 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; | |
1462 } | |
1463 } else { | |
1464 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. | |
1465 # We are reading from it further down | |
1466 } | |
1467 | |
1468 my $methylation_call_strings_processed = 0; | |
1469 my $line_count = 0; | |
1470 | |
1471 ### proceeding differently now for single-end or paired-end Bismark files | |
1472 | |
1473 ### PROCESSING SINGLE-END RESULT FILES | |
1474 if ($single) { | |
1475 | |
1476 ### also proceeding differently now for SAM format or vanilla Bismark format files | |
1477 if ($vanilla) { # old vanilla Bismark output format | |
1478 while (<IN>) { | |
1479 ++$line_count; | |
1480 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1481 | |
1482 if ( ($line_count - $offset)%$multicore == 0){ | |
1483 # warn "line count: $line_count\noffset: $offset\n"; | |
1484 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; | |
1485 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; | |
1486 } | |
1487 else{ | |
1488 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; | |
1489 next; | |
1490 } | |
1491 | |
1492 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
1493 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; | |
1494 | |
1495 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or | |
1496 ### last position | |
1497 chomp $genome_conversion; | |
1498 | |
1499 my $index; | |
1500 if ($meth_call) { | |
1501 | |
1502 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
1503 $index = 0; | |
1504 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
1505 $index = 1; | |
1506 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
1507 $index = 3; | |
1508 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
1509 $index = 2; | |
1510 } else { | |
1511 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
1512 } | |
1513 | |
1514 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
1515 if ($ignore) { | |
1516 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
1517 | |
1518 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
1519 if ($strand eq '+') { | |
1520 $start += $ignore; | |
1521 } | |
1522 elsif ($strand eq '-') { | |
1523 $start += length($meth_call)-1; ## $meth_call is already shortened! | |
1524 } | |
1525 else { | |
1526 die "Alignment did not have proper strand information: $strand\n"; | |
1527 } | |
1528 } | |
1529 | |
1530 ### Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int> | |
1531 if ($ignore_3prime) { | |
1532 | |
1533 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); | |
1534 | |
1535 ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly | |
1536 if ($strand eq '+') { | |
1537 # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all | |
1538 } | |
1539 elsif ($strand eq '-') { | |
1540 # here we need to discriminate if the start has been adjusted because of --ignore or not | |
1541 if ($ignore){ | |
1542 # position adjusted already, and because of this 3' trimming is irrelevant for the start position | |
1543 } | |
1544 else{ | |
1545 # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step | |
1546 $start += $ignore_3prime; | |
1547 } | |
1548 } | |
1549 else { | |
1550 die "Alignment did not have proper strand information: $strand\n"; | |
1551 } | |
1552 } | |
1553 ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage | |
1554 | |
1555 ### printing out the methylation state of every C in the read | |
1556 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
1557 | |
1558 ++$methylation_call_strings_processed; # 1 per single-end result | |
1559 } | |
1560 } | |
1561 } else { # processing single-end SAM format (default) | |
1562 while (<IN>) { | |
1563 chomp; | |
1564 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
1565 if (/^\@/) { # skipping header lines (starting with @) | |
1566 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); | |
1567 next; | |
1568 } | |
1569 | |
1570 ++$line_count; | |
1571 # warn "$line_count\n"; | |
1572 warn "Processed lines: $line_count\n" if ($line_count%500000 == 0); | |
1573 | |
1574 if ( ($line_count - $offset)%$multicore == 0){ | |
1575 # warn "line count: $line_count\noffset: $offset\n"; | |
1576 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; | |
1577 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; | |
1578 } | |
1579 else{ | |
1580 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; | |
1581 next; | |
1582 } | |
1583 | |
1584 # example read in SAM format | |
1585 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
1586 ### | |
1587 | |
1588 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
1589 # < 0.7.6 $meth_call =~ s/^XM:Z://; | |
1590 # < 0.7.6 $read_conversion =~ s/^XR:Z://; | |
1591 # < 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
1592 | |
1593 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; | |
1594 | |
1595 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1596 my $meth_call; ### Thanks to Zachary Zeno for this solution | |
1597 my $read_conversion; | |
1598 my $genome_conversion; | |
1599 | |
1600 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
1601 my $tag = $1; | |
1602 my $value = $2; | |
1603 | |
1604 if ($tag eq "XM") { | |
1605 $meth_call = $value; | |
1606 $meth_call =~ s/\r//; | |
1607 } elsif ($tag eq "XR") { | |
1608 $read_conversion = $value; | |
1609 $read_conversion =~ s/\r//; | |
1610 } elsif ($tag eq "XG") { | |
1611 $genome_conversion = $value; | |
1612 $genome_conversion =~ s/\r//; | |
1613 } | |
1614 } | |
1615 | |
1616 my $strand; | |
1617 # warn "$meth_call\n$read_conversion\n$genome_conversion\n"; | |
1618 | |
1619 my $index; | |
1620 if ($meth_call) { | |
1621 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand | |
1622 $index = 0; | |
1623 $strand = '+'; | |
1624 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand | |
1625 $index = 1; | |
1626 $strand = '-'; | |
1627 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand | |
1628 $index = 2; | |
1629 $strand = '+'; | |
1630 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand | |
1631 $index = 3; | |
1632 $strand = '-'; | |
1633 } else { | |
1634 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; | |
1635 } | |
1636 | |
1637 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
1638 if ($strand eq '-') { | |
1639 $meth_call = reverse $meth_call; | |
1640 } | |
1641 # warn "\n$meth_call\n"; | |
1642 | |
1643 ### IGNORE 5 PRIME: Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int> | |
1644 if ($ignore) { | |
1645 # warn "\n\n$meth_call\n"; | |
1646 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); | |
1647 # warn "$meth_call\n";sleep(1); | |
1648 | |
1649 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1650 | |
1651 my @len = split (/\D+/,$cigar); # storing the length per operation | |
1652 my @ops = split (/\d+/,$cigar); # storing the operation | |
1653 shift @ops; # remove the empty first element | |
1654 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1655 | |
1656 my @comp_cigar; # building an array with all CIGAR operations | |
1657 foreach my $index (0..$#len) { | |
1658 foreach (1..$len[$index]) { | |
1659 # print "$ops[$index]"; | |
1660 push @comp_cigar, $ops[$index]; | |
1661 } | |
1662 } | |
1663 # print "original CIGAR: $cigar\n"; | |
1664 # print "original CIGAR: @comp_cigar\n"; | |
1665 | |
1666 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! | |
1667 if ($strand eq '+') { | |
1668 | |
1669 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
1670 my $I_count = 0; | |
1671 | |
1672 for (1..$ignore) { | |
1673 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start | |
1674 # print "$_ deleted $op\n"; | |
1675 | |
1676 while ($op eq 'D') { # repeating this for deletions (D) | |
1677 $D_count++; | |
1678 $op = shift @comp_cigar; | |
1679 # print "$_ deleted $op\n"; | |
1680 } | |
1681 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1682 $I_count++; | |
1683 } | |
1684 } | |
1685 $start += $ignore + $D_count - $I_count; | |
1686 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; | |
1687 } | |
1688 elsif ($strand eq '-') { | |
1689 | |
1690 for (1..$ignore) { | |
1691 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
1692 while ($op eq 'D') { # repeating this for deletions (D) | |
1693 $op = pop @comp_cigar; | |
1694 } | |
1695 } | |
1696 | |
1697 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR | |
1698 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence | |
1699 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position | |
1700 foreach (@comp_cigar) { | |
1701 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); | |
1702 } | |
1703 $start += $MD_count - 1; | |
1704 } | |
1705 | |
1706 ### reconstituting shortened CIGAR string | |
1707 my $new_cigar; | |
1708 my $count = 0; | |
1709 my $last_op; | |
1710 # print "ignore adjusted: @comp_cigar\n"; | |
1711 foreach my $op (@comp_cigar) { | |
1712 unless (defined $last_op){ | |
1713 $last_op = $op; | |
1714 ++$count; | |
1715 next; | |
1716 } | |
1717 if ($last_op eq $op) { | |
1718 ++$count; | |
1719 } else { | |
1720 $new_cigar .= "$count$last_op"; | |
1721 $last_op = $op; | |
1722 $count = 1; | |
1723 } | |
1724 } | |
1725 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
1726 $cigar = $new_cigar; | |
1727 # print "ignore adjusted scalar: $cigar\n"; | |
1728 } | |
1729 | |
1730 ####################### | |
1731 ### INGORE 3' END ### | |
1732 ####################### | |
1733 | |
1734 # Clipping off the last <int> number of bases from the methylation call string as specified with --ignore_3prime <int> | |
1735 if ($ignore_3prime) { | |
1736 # warn "$meth_call\n"; | |
1737 $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); | |
1738 # warn "$meth_call\n";sleep(1); | |
1739 | |
1740 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
1741 | |
1742 my @len = split (/\D+/,$cigar); # storing the length per operation | |
1743 my @ops = split (/\d+/,$cigar); # storing the operation | |
1744 shift @ops; # remove the empty first element | |
1745 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
1746 | |
1747 my @comp_cigar; # building an array with all CIGAR operations | |
1748 foreach my $index (0..$#len) { | |
1749 foreach (1..$len[$index]) { | |
1750 # print "$ops[$index]"; | |
1751 push @comp_cigar, $ops[$index]; | |
1752 } | |
1753 } | |
1754 | |
1755 # print "original CIGAR: $cigar\n"; | |
1756 # print join ("",@comp_cigar),"\n"; | |
1757 | |
1758 ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly | |
1759 if ($strand eq '+') { | |
1760 | |
1761 ### clipping the 3' end does not affect the starting position of forward strand alignments | |
1762 # ignore 5' has already been taken care of at this stage, if relevant at all | |
1763 | |
1764 for (1..$ignore_3prime) { | |
1765 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end | |
1766 # print join ("",@comp_cigar),"\n"; | |
1767 # print "$_ deleted $op from 3' end\n"; | |
1768 while ($op eq 'D') { # repeating this for deletions (D) | |
1769 $op = pop @comp_cigar; | |
1770 # print join ("",@comp_cigar),"\n"; | |
1771 # print "$_ deleted $op from 3' end\n"; | |
1772 } | |
1773 } | |
1774 # print "Final truncated CIGAR string:\n"; | |
1775 # print join ("",@comp_cigar),"\n"; | |
1776 # $start += $ignore + $D_count - $I_count; | |
1777 # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n"; | |
1778 } | |
1779 elsif ($strand eq '-') { | |
1780 | |
1781 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions | |
1782 my $I_count = 0; | |
1783 | |
1784 for (1..$ignore_3prime) { | |
1785 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array | |
1786 while ($op eq 'D') { # repeating this for deletions (D) | |
1787 $D_count++; | |
1788 $op = shift @comp_cigar; | |
1789 } | |
1790 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
1791 $I_count++; | |
1792 } | |
1793 | |
1794 } | |
1795 | |
1796 # here we need to discriminate if the start has been adjusted because of --ignore or not | |
1797 if ($ignore){ | |
1798 # the start position has already been modified for --ignore already, so we don't have to adjust the position again now | |
1799 } | |
1800 else{ | |
1801 # Here we need to add the length ignore_3prime to the read starting position | |
1802 # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step | |
1803 $start += $ignore_3prime + $D_count - $I_count; | |
1804 } | |
1805 | |
1806 } | |
1807 | |
1808 ### reconstituting shortened CIGAR string | |
1809 my $new_cigar; | |
1810 my $count = 0; | |
1811 my $last_op; | |
1812 # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n"; | |
1813 foreach my $op (@comp_cigar) { | |
1814 unless (defined $last_op){ | |
1815 $last_op = $op; | |
1816 ++$count; | |
1817 next; | |
1818 } | |
1819 if ($last_op eq $op) { | |
1820 ++$count; | |
1821 } else { | |
1822 $new_cigar .= "$count$last_op"; | |
1823 $last_op = $op; | |
1824 $count = 1; | |
1825 } | |
1826 } | |
1827 $new_cigar .= "$count$last_op"; # appending the last operation and count | |
1828 $cigar = $new_cigar; | |
1829 # print "ignore_3prime adjusted scalar: $cigar\n"; | |
1830 } | |
1831 } | |
1832 ### printing out the methylation state of every C in the read | |
1833 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); | |
1834 | |
1835 ++$methylation_call_strings_processed; # 1 per single-end result | |
1836 } | |
1837 } | |
1838 } | |
1839 | |
1840 ### PROCESSING PAIRED-END RESULT FILES | |
1841 elsif ($paired) { | |
1842 | |
1843 ### proceeding differently now for SAM format or vanilla Bismark format files | |
1844 if ($vanilla) { # old vanilla Bismark paired-end output format | |
1845 while (<IN>) { | |
1846 ++$line_count; | |
1847 warn "processed line: $line_count\n" if ($line_count%500000 == 0); | |
1848 | |
1849 if ( ($line_count - $offset)%$multicore == 0){ | |
1850 # warn "line count: $line_count\noffset: $offset\n"; | |
1851 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; | |
1852 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; | |
1853 } | |
1854 else{ | |
1855 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; | |
1856 next; | |
1857 } | |
1858 | |
1859 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) | |
1860 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; | |
1861 | |
1862 my $index; | |
1863 chomp $genome_conversion; | |
1864 | |
1865 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
1866 $index = 0; ## this is OT | |
1867 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
1868 $index = 2; ## this is CTOB!!! | |
1869 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
1870 $index = 1; ## this is CTOT!!! | |
1871 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
1872 $index = 3; ## this is OB | |
1873 } else { | |
1874 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
1875 } | |
1876 | |
1877 if ($meth_call_1 and $meth_call_2) { | |
1878 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
1879 | |
1880 ### IGNORE FROM 5' END | |
1881 if ($ignore) { | |
1882 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
1883 | |
1884 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified | |
1885 $start_read_1 += $ignore; | |
1886 } | |
1887 if ($ignore_r2) { | |
1888 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
1889 | |
1890 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified | |
1891 $end_read_2 -= $ignore_r2; | |
1892 } | |
1893 | |
1894 ### IGNORE 3' END | |
1895 | |
1896 ### Clipping off the last <int> number of bases from the methylation call string of Read 1 as specified with --ignore_3prime <int> | |
1897 if ($ignore_3prime) { | |
1898 $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime); | |
1899 # we don't have to adjust the position now since the shortened methylation call will be fine, see below | |
1900 } | |
1901 ### Clipping off the last <int> number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 <int> | |
1902 if ($ignore_3prime_r2) { | |
1903 $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2); | |
1904 # we don't have to adjust the position now since the shortened methylation call will be fine, see below | |
1905 } | |
1906 | |
1907 my $end_read_1; | |
1908 my $start_read_2; | |
1909 | |
1910 if ($strand eq '+') { | |
1911 | |
1912 $end_read_1 = $start_read_1 + length($meth_call_1) - 1; | |
1913 $start_read_2 = $end_read_2 - length($meth_call_2) + 1; | |
1914 | |
1915 ## we first pass the first read which is in + orientation on the forward strand | |
1916 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity | |
1917 | |
1918 # we next pass the second read which is in - orientation on the reverse strand | |
1919 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
1920 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); | |
1921 } | |
1922 else { | |
1923 | |
1924 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! | |
1925 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! | |
1926 | |
1927 ## we first pass the first read which is in - orientation on the reverse strand | |
1928 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); | |
1929 | |
1930 # we next pass the second read which is in + orientation on the forward strand | |
1931 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
1932 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); | |
1933 } | |
1934 | |
1935 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
1936 } | |
1937 } | |
1938 } | |
1939 else { # Bismark paired-end BAM/SAM output format (default) | |
1940 while (<IN>) { | |
1941 chomp; | |
1942 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
1943 if (/^\@/) { # skipping header lines (starting with @) | |
1944 warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess | |
1945 next; | |
1946 } | |
1947 | |
1948 ++$line_count; | |
1949 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
1950 | |
1951 if ( ($line_count - $offset)%$multicore == 0){ | |
1952 # warn "line count: $line_count\noffset: $offset\n"; | |
1953 # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; | |
1954 # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; | |
1955 } | |
1956 else{ | |
1957 # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; | |
1958 $_ = <IN>; # reading and skipping another line since this is the paired-end read | |
1959 next; | |
1960 } | |
1961 | |
1962 # example paired-end reads in SAM format (2 consecutive lines) | |
1963 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
1964 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
1965 | |
1966 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1967 my $meth_call_1; | |
1968 my $first_read_conversion; | |
1969 my $genome_conversion; | |
1970 | |
1971 while ( /(XM|XR|XG):Z:([^\t]+)/g ) { | |
1972 my $tag = $1; | |
1973 my $value = $2; | |
1974 | |
1975 if ($tag eq "XM") { | |
1976 $meth_call_1 = $value; | |
1977 $meth_call_1 =~ s/\r//; | |
1978 } elsif ($tag eq "XR") { | |
1979 $first_read_conversion = $value; | |
1980 $first_read_conversion =~ s/\r//; | |
1981 } elsif ($tag eq "XG") { | |
1982 $genome_conversion = $value; | |
1983 $genome_conversion =~ s/\r//; | |
1984 } | |
1985 } | |
1986 | |
1987 $_ = <IN>; # reading in the paired read | |
1988 chomp; | |
1989 | |
1990 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression | |
1991 | |
1992 my $meth_call_2; | |
1993 my $second_read_conversion; | |
1994 | |
1995 while ( /(XM|XR):Z:([^\t]+)/g ) { | |
1996 my $tag = $1; | |
1997 my $value = $2; | |
1998 | |
1999 if ($tag eq "XM") { | |
2000 $meth_call_2 = $value; | |
2001 $meth_call_2 =~ s/\r//; | |
2002 } elsif ($tag eq "XR") { | |
2003 $second_read_conversion = $value; | |
2004 $second_read_conversion = s/\r//; | |
2005 } | |
2006 } | |
2007 | |
2008 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; | |
2009 # chomp $genome_conversion; # in case it captured a new line character # already removed | |
2010 | |
2011 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; | |
2012 | |
2013 my $index; | |
2014 my $strand; | |
2015 | |
2016 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { | |
2017 $index = 0; ## this is OT | |
2018 $strand = '+'; | |
2019 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { | |
2020 $index = 1; ## this is CTOT | |
2021 $strand = '-'; | |
2022 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { | |
2023 $index = 2; ## this is CTOB | |
2024 $strand = '+'; | |
2025 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { | |
2026 $index = 3; ## this is OB | |
2027 $strand = '-'; | |
2028 } else { | |
2029 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
2030 } | |
2031 | |
2032 ### reversing the methylation call of the read that was reverse-complemented | |
2033 if ($strand eq '+') { | |
2034 $meth_call_2 = reverse $meth_call_2; | |
2035 } else { | |
2036 $meth_call_1 = reverse $meth_call_1; | |
2037 } | |
2038 # warn "\n$meth_call_1\n$meth_call_2\n"; | |
2039 | |
2040 if ($meth_call_1 and $meth_call_2) { | |
2041 | |
2042 my $end_read_1; | |
2043 | |
2044 ### READ 1 | |
2045 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
2046 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
2047 shift @ops_1; # remove the empty first element | |
2048 | |
2049 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); | |
2050 | |
2051 my @comp_cigar_1; # building an array with all CIGAR operations | |
2052 foreach my $index (0..$#len_1) { | |
2053 foreach (1..$len_1[$index]) { | |
2054 # print "$ops_1[$index]"; | |
2055 push @comp_cigar_1, $ops_1[$index]; | |
2056 } | |
2057 } | |
2058 # print "original CIGAR read 1: $cigar_1\n"; | |
2059 # print "original CIGAR read 1: @comp_cigar_1\n"; | |
2060 | |
2061 ### READ 2 | |
2062 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
2063 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
2064 shift @ops_2; # remove the empty first element | |
2065 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
2066 my @comp_cigar_2; # building an array with all CIGAR operations for read 2 | |
2067 foreach my $index (0..$#len_2) { | |
2068 foreach (1..$len_2[$index]) { | |
2069 # print "$ops_2[$index]"; | |
2070 push @comp_cigar_2, $ops_2[$index]; | |
2071 } | |
2072 } | |
2073 # print "original CIGAR read 2: $cigar_2\n"; | |
2074 # print "original CIGAR read 2: @comp_cigar_2\n"; | |
2075 | |
2076 ################################## | |
2077 ### IGNORE BASES FROM 5' END ### | |
2078 ################################## | |
2079 | |
2080 if ($ignore) { | |
2081 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore <int>' for read 1 | |
2082 ### the methylation calls have already been reversed where necessary | |
2083 | |
2084 if ( (length($meth_call_1) - $ignore) <= 0){ | |
2085 # next; # skipping this read entirely if the read is shorter than the portion to be ignored | |
2086 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored | |
2087 } | |
2088 else { | |
2089 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); | |
2090 | |
2091 if ($strand eq '+') { | |
2092 | |
2093 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start | |
2094 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
2095 my $I_count_1 = 0; | |
2096 | |
2097 for (1..$ignore) { | |
2098 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start | |
2099 # print "$_ deleted $op\n"; | |
2100 | |
2101 while ($op eq 'D') { # repeating this for deletions (D) | |
2102 $D_count_1++; | |
2103 $op = shift @comp_cigar_1; | |
2104 # print "$_ deleted $op\n"; | |
2105 } | |
2106 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
2107 $I_count_1++; | |
2108 } | |
2109 } | |
2110 | |
2111 $start_read_1 += $ignore + $D_count_1 - $I_count_1; | |
2112 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; | |
2113 | |
2114 # the start position of reads mapping to the reverse strand is being adjusted further below | |
2115 } | |
2116 elsif ($strand eq '-') { | |
2117 | |
2118 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back | |
2119 for (1..$ignore) { | |
2120 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
2121 while ($op eq 'D') { # repeating this for deletions (D) | |
2122 $op = pop @comp_cigar_1; | |
2123 } | |
2124 } | |
2125 # the start position of reads mapping to the reverse strand is being adjusted further below | |
2126 | |
2127 } | |
2128 } | |
2129 } | |
2130 | |
2131 if ($ignore_r2) { | |
2132 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2 | |
2133 ### the methylation calls have already been reversed where necessary | |
2134 | |
2135 if ( (length($meth_call_2) - $ignore_r2) <= 0){ | |
2136 # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair! | |
2137 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored | |
2138 } | |
2139 else { | |
2140 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); | |
2141 | |
2142 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly | |
2143 | |
2144 if ($strand eq '+') { | |
2145 | |
2146 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back | |
2147 | |
2148 for (1..$ignore_r2) { | |
2149 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array | |
2150 while ($op eq 'D') { # repeating this for deletions (D) | |
2151 $op = pop @comp_cigar_2; | |
2152 } | |
2153 } | |
2154 # the start position of reads mapping to the reverse strand is being adjusted further below | |
2155 } | |
2156 elsif ($strand eq '-') { | |
2157 | |
2158 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start | |
2159 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
2160 my $I_count_2 = 0; | |
2161 | |
2162 for (1..$ignore_r2) { | |
2163 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
2164 # print "$_ deleted $op\n"; | |
2165 | |
2166 while ($op eq 'D') { # repeating this for deletions (D) | |
2167 $D_count_2++; | |
2168 $op = shift @comp_cigar_2; | |
2169 # print "$_ deleted $op\n"; | |
2170 } | |
2171 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
2172 $I_count_2++; | |
2173 } | |
2174 } | |
2175 | |
2176 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; | |
2177 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
2178 } | |
2179 } | |
2180 } | |
2181 | |
2182 if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string | |
2183 | |
2184 ### reconstituting shortened CIGAR string 1 | |
2185 my $new_cigar_1; | |
2186 my $count_1 = 0; | |
2187 my $last_op_1; | |
2188 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; | |
2189 foreach my $op (@comp_cigar_1) { | |
2190 unless (defined $last_op_1){ | |
2191 $last_op_1 = $op; | |
2192 ++$count_1; | |
2193 next; | |
2194 } | |
2195 if ($last_op_1 eq $op) { | |
2196 ++$count_1; | |
2197 } else { | |
2198 $new_cigar_1 .= "$count_1$last_op_1"; | |
2199 $last_op_1 = $op; | |
2200 $count_1 = 1; | |
2201 } | |
2202 } | |
2203 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
2204 $cigar_1 = $new_cigar_1; | |
2205 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; | |
2206 } | |
2207 | |
2208 if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string | |
2209 | |
2210 ### reconstituting shortened CIGAR string 2 | |
2211 my $new_cigar_2; | |
2212 my $count_2 = 0; | |
2213 my $last_op_2; | |
2214 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; | |
2215 foreach my $op (@comp_cigar_2) { | |
2216 unless (defined $last_op_2){ | |
2217 $last_op_2 = $op; | |
2218 ++$count_2; | |
2219 next; | |
2220 } | |
2221 if ($last_op_2 eq $op) { | |
2222 ++$count_2; | |
2223 } | |
2224 else { | |
2225 $new_cigar_2 .= "$count_2$last_op_2"; | |
2226 $last_op_2 = $op; | |
2227 $count_2 = 1; | |
2228 } | |
2229 } | |
2230 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
2231 $cigar_2 = $new_cigar_2; | |
2232 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; | |
2233 } | |
2234 | |
2235 ########################### | |
2236 ### END IGNORE 5' END ### | |
2237 ########################### | |
2238 | |
2239 ################################## | |
2240 ### IGNORE BASES FROM 3' END ### | |
2241 ################################## | |
2242 | |
2243 # print "CIGAR string before truncating 3' end (Read 1)\n"; | |
2244 # print join ("",@comp_cigar_1),"\n"; | |
2245 | |
2246 if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further | |
2247 | |
2248 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime <int>' for read 1 | |
2249 ### the methylation calls have already been reversed where necessary | |
2250 | |
2251 if ( (length($meth_call_1) - $ignore_3prime) <= 0){ | |
2252 $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored | |
2253 } | |
2254 else { | |
2255 $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime); | |
2256 # warn "truncated meth_call 1:\n$meth_call_1\n"; | |
2257 | |
2258 if ($strand eq '+') { | |
2259 | |
2260 ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments | |
2261 # ignore 5' has already been taken care of at this stage, if relevant at all | |
2262 | |
2263 for (1..$ignore_3prime) { | |
2264 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end | |
2265 # print "$_ deleted $op from 3' end\n"; | |
2266 # print join ("",@comp_cigar_1),"\n"; | |
2267 | |
2268 while ($op eq 'D') { # repeating this for deletions (D) | |
2269 $op = pop @comp_cigar_1; | |
2270 # print join ("",@comp_cigar_1),"\n"; | |
2271 # print "$_ deleted $op from 3' end\n"; | |
2272 } | |
2273 } | |
2274 | |
2275 # print "Final truncated CIGAR string (Read 1):\n"; | |
2276 # print join ("",@comp_cigar_1),"\n"; | |
2277 | |
2278 } | |
2279 elsif ($strand eq '-') { | |
2280 | |
2281 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions | |
2282 my $I_count_1 = 0; | |
2283 | |
2284 ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start | |
2285 for (1..$ignore_3prime) { | |
2286 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array | |
2287 # print join ("",@comp_cigar_1),"\n"; | |
2288 | |
2289 while ($op eq 'D') { # repeating this for deletions (D) | |
2290 $D_count_1++; | |
2291 $op = shift @comp_cigar_1; | |
2292 # print join ("",@comp_cigar_1),"\n"; | |
2293 } | |
2294 | |
2295 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
2296 $I_count_1++; | |
2297 } | |
2298 } | |
2299 | |
2300 # print "Final truncated CIGAR string reverse_read:\n"; | |
2301 # print join ("",@comp_cigar_1),"\n"; | |
2302 | |
2303 # Here we need to add the length ignore_3prime to the read starting position | |
2304 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step | |
2305 $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1; | |
2306 | |
2307 } | |
2308 } | |
2309 } | |
2310 | |
2311 if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further | |
2312 | |
2313 ### Clipping off the last <int> number of bases from the methylation call string as specified with '--ignore_3prime_r2 <int>' for read 2 | |
2314 ### the methylation calls have already been reversed where necessary | |
2315 | |
2316 if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){ | |
2317 $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored | |
2318 } | |
2319 else { | |
2320 $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2); | |
2321 # warn "truncated meth_call 2:\n$meth_call_2\n"; | |
2322 | |
2323 ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly | |
2324 | |
2325 if ($strand eq '+') { | |
2326 | |
2327 ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment) | |
2328 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start | |
2329 | |
2330 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions | |
2331 my $I_count_2 = 0; | |
2332 | |
2333 for (1..$ignore_3prime_r2) { | |
2334 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array | |
2335 # print "$_ deleted $op from 3' end\n"; | |
2336 # print join ("",@comp_cigar_2),"\n"; | |
2337 | |
2338 while ($op eq 'D') { # repeating this for deletions (D) | |
2339 $D_count_2++; | |
2340 $op = shift @comp_cigar_2; | |
2341 # print "$_ deleted $op from 3' end\n"; | |
2342 # print join ("",@comp_cigar_2),"\n"; | |
2343 } | |
2344 | |
2345 if ($op eq 'I') { # adjusting the genomic position for insertions (I) | |
2346 $I_count_2++; | |
2347 } | |
2348 } | |
2349 | |
2350 # print "Final truncated CIGAR string 2 (+ alignment):\n"; | |
2351 # print join ("",@comp_cigar_2),"\n"; | |
2352 | |
2353 # Here we need to add the length ignore_3prime_r2 to the read starting position | |
2354 # adjustment of the true start position of this reverse read will take place later in the methylation extraction step | |
2355 $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2; | |
2356 | |
2357 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; | |
2358 } | |
2359 elsif ($strand eq '-') { | |
2360 ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment) | |
2361 ### ignore_r2 5' has already been taken care of at this stage, if relevant at all | |
2362 | |
2363 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end | |
2364 | |
2365 for (1..$ignore_3prime_r2) { | |
2366 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start | |
2367 # print "$_ deleted $op\n"; | |
2368 # print join ("",@comp_cigar_2),"\n"; | |
2369 | |
2370 while ($op eq 'D') { # repeating this for deletions (D) | |
2371 $op = pop @comp_cigar_2; | |
2372 # print "$_ deleted $op\n"; | |
2373 # print join ("",@comp_cigar_2),"\n"; | |
2374 } | |
2375 } | |
2376 | |
2377 # print "Final truncated CIGAR string 2 (- alignment):\n"; | |
2378 # print join ("",@comp_cigar_2),"\n"; | |
2379 | |
2380 } | |
2381 } | |
2382 } | |
2383 | |
2384 if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string | |
2385 | |
2386 ### reconstituting shortened CIGAR string 1 | |
2387 my $new_cigar_1; | |
2388 my $count_1 = 0; | |
2389 my $last_op_1; | |
2390 # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n"; | |
2391 foreach my $op (@comp_cigar_1) { | |
2392 unless (defined $last_op_1){ | |
2393 $last_op_1 = $op; | |
2394 ++$count_1; | |
2395 next; | |
2396 } | |
2397 if ($last_op_1 eq $op) { | |
2398 ++$count_1; | |
2399 } | |
2400 else { | |
2401 $new_cigar_1 .= "$count_1$last_op_1"; | |
2402 $last_op_1 = $op; | |
2403 $count_1 = 1; | |
2404 } | |
2405 } | |
2406 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count | |
2407 $cigar_1 = $new_cigar_1; | |
2408 # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n"; | |
2409 } | |
2410 | |
2411 if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string | |
2412 | |
2413 ### reconstituting shortened CIGAR string 2 | |
2414 my $new_cigar_2; | |
2415 my $count_2 = 0; | |
2416 my $last_op_2; | |
2417 # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n"; | |
2418 foreach my $op (@comp_cigar_2) { | |
2419 unless (defined $last_op_2){ | |
2420 $last_op_2 = $op; | |
2421 ++$count_2; | |
2422 next; | |
2423 } | |
2424 if ($last_op_2 eq $op) { | |
2425 ++$count_2; | |
2426 } | |
2427 else { | |
2428 $new_cigar_2 .= "$count_2$last_op_2"; | |
2429 $last_op_2 = $op; | |
2430 $count_2 = 1; | |
2431 } | |
2432 } | |
2433 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count | |
2434 $cigar_2 = $new_cigar_2; | |
2435 # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; | |
2436 } | |
2437 | |
2438 ########################### | |
2439 ### END IGNORE 3' END ### | |
2440 ########################### | |
2441 | |
2442 | |
2443 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on | |
2444 | |
2445 if ($strand eq '+') { | |
2446 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 | |
2447 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2448 # print "reverse: @comp_cigar_2\n"; | |
2449 | |
2450 my $MD_count_1 = 0; | |
2451 foreach (@comp_cigar_1) { | |
2452 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2453 } | |
2454 | |
2455 my $MD_count_2 = 0; | |
2456 foreach (@comp_cigar_2) { | |
2457 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2458 } | |
2459 | |
2460 $end_read_1 = $start_read_1 + $MD_count_1 - 1; | |
2461 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand | |
2462 } | |
2463 else { | |
2464 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 | |
2465 | |
2466 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2467 # print "reverse: @comp_cigar_1\n"; | |
2468 | |
2469 my $MD_count_1 = 0; | |
2470 foreach (@comp_cigar_1) { | |
2471 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
2472 } | |
2473 | |
2474 $end_read_1 = $start_read_1; | |
2475 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand | |
2476 } | |
2477 | |
2478 if ($strand eq '+') { | |
2479 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity | |
2480 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); | |
2481 | |
2482 # we next pass the second read which is in - orientation on the reverse strand | |
2483 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
2484 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); | |
2485 } | |
2486 else { | |
2487 ## we first pass the first read which is in - orientation on the reverse strand | |
2488 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); | |
2489 | |
2490 # we next pass the second read which is in + orientation on the forward strand | |
2491 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
2492 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); | |
2493 } | |
2494 | |
2495 $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings | |
2496 } | |
2497 } | |
2498 } | |
2499 } else { | |
2500 die "Single-end or paired-end reads not specified properly\n"; | |
2501 } | |
2502 | |
2503 $counting{sequences_count} = $line_count; | |
2504 $counting{methylation_call_strings} = $methylation_call_strings_processed; | |
2505 | |
2506 if ($multicore == 1){ | |
2507 print_splitting_report (); | |
2508 } | |
2509 elsif ($multicore > 1){ | |
2510 print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); | |
2511 print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); | |
2512 } | |
2513 | |
2514 return ($process_id,\@pids,$report_filename); | |
2515 | |
2516 } | |
2517 | |
2518 | |
2519 | |
2520 sub print_splitting_report{ | |
2521 | |
2522 ### Calculating methylation percentages if applicable | |
2523 warn "\nProcessed $counting{sequences_count} lines in total\n"; | |
2524 warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; | |
2525 if ($report) { | |
2526 print REPORT "\nProcessed $counting{sequences_count} lines in total\n"; | |
2527 print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; | |
2528 } | |
2529 | |
2530 my $percent_meCpG; | |
2531 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
2532 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
2533 } | |
2534 | |
2535 my $percent_meCHG; | |
2536 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
2537 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
2538 } | |
2539 | |
2540 my $percent_meCHH; | |
2541 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
2542 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
2543 } | |
2544 | |
2545 my $percent_non_CpG_methylation; | |
2546 if ($merge_non_CpG){ | |
2547 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
2548 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
2549 } | |
2550 } | |
2551 | |
2552 if ($report){ | |
2553 ### detailed information about Cs analysed | |
2554 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
2555 | |
2556 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
2557 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
2558 | |
2559 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
2560 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
2561 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
2562 | |
2563 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
2564 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
2565 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
2566 | |
2567 ### calculating methylated CpG percentage if applicable | |
2568 if ($percent_meCpG){ | |
2569 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
2570 } | |
2571 else{ | |
2572 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
2573 } | |
2574 | |
2575 ### 2-Context Output | |
2576 if ($merge_non_CpG){ | |
2577 if ($percent_non_CpG_methylation){ | |
2578 print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
2579 } | |
2580 else{ | |
2581 print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
2582 } | |
2583 } | |
2584 | |
2585 ### 3 Context Output | |
2586 else{ | |
2587 ### calculating methylated CHG percentage if applicable | |
2588 if ($percent_meCHG){ | |
2589 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
2590 } | |
2591 else{ | |
2592 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
2593 } | |
2594 | |
2595 ### calculating methylated CHH percentage if applicable | |
2596 if ($percent_meCHH){ | |
2597 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
2598 } | |
2599 else{ | |
2600 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
2601 } | |
2602 } | |
2603 } | |
2604 | |
2605 ### detailed information about Cs analysed for on-screen report | |
2606 warn "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
2607 | |
2608 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
2609 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
2610 | |
2611 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
2612 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
2613 warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
2614 | |
2615 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
2616 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
2617 warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
2618 | |
2619 ### printing methylated CpG percentage if applicable | |
2620 if ($percent_meCpG){ | |
2621 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
2622 } | |
2623 else{ | |
2624 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
2625 } | |
2626 | |
2627 ### 2-Context Output | |
2628 if ($merge_non_CpG){ | |
2629 if ($percent_non_CpG_methylation){ | |
2630 warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
2631 } | |
2632 else{ | |
2633 warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
2634 } | |
2635 } | |
2636 | |
2637 ### 3-Context Output | |
2638 else{ | |
2639 ### printing methylated CHG percentage if applicable | |
2640 if ($percent_meCHG){ | |
2641 warn "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
2642 } | |
2643 else{ | |
2644 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
2645 } | |
2646 | |
2647 ### printing methylated CHH percentage if applicable | |
2648 if ($percent_meCHH){ | |
2649 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
2650 } | |
2651 else{ | |
2652 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
2653 } | |
2654 } | |
2655 } | |
2656 | |
2657 ### | |
2658 | |
2659 sub print_splitting_report_multicore{ | |
2660 | |
2661 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; | |
2662 | |
2663 # warn "\$report_filename is $report_filename\n"; | |
2664 my $special_report = $report_filename.".$offset"; | |
2665 | |
2666 open (SPECIAL_REPORT,'>',$special_report) or die $!; | |
2667 # warn "line count\t$line_count\n"; | |
2668 # warn "meth call strings\t$meth_call_strings\n"; | |
2669 | |
2670 print SPECIAL_REPORT "line count\t$line_count\n"; | |
2671 print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n"; | |
2672 | |
2673 ### Calculating methylation percentages if applicable | |
2674 my $percent_meCpG; | |
2675 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
2676 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
2677 } | |
2678 | |
2679 my $percent_meCHG; | |
2680 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
2681 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
2682 } | |
2683 | |
2684 my $percent_meCHH; | |
2685 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
2686 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
2687 } | |
2688 | |
2689 my $percent_non_CpG_methylation; | |
2690 if ($merge_non_CpG){ | |
2691 if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
2692 $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); | |
2693 } | |
2694 } | |
2695 | |
2696 if ($report){ | |
2697 | |
2698 ### detailed information about Cs analysed | |
2699 print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
2700 | |
2701 my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | |
2702 print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
2703 | |
2704 print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
2705 print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
2706 print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | |
2707 | |
2708 print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
2709 print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
2710 print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | |
2711 | |
2712 ### calculating methylated CpG percentage if applicable | |
2713 if ($percent_meCpG){ | |
2714 print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
2715 } | |
2716 else{ | |
2717 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
2718 } | |
2719 | |
2720 ### 2-Context Output | |
2721 if ($merge_non_CpG){ | |
2722 if ($percent_non_CpG_methylation){ | |
2723 print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; | |
2724 } | |
2725 else{ | |
2726 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; | |
2727 } | |
2728 } | |
2729 | |
2730 ### 3 Context Output | |
2731 else{ | |
2732 ### calculating methylated CHG percentage if applicable | |
2733 if ($percent_meCHG){ | |
2734 print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
2735 } | |
2736 else{ | |
2737 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
2738 } | |
2739 | |
2740 ### calculating methylated CHH percentage if applicable | |
2741 if ($percent_meCHH){ | |
2742 print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | |
2743 } | |
2744 else{ | |
2745 print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | |
2746 } | |
2747 } | |
2748 } | |
2749 close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n"; | |
2750 } | |
2751 | |
2752 | |
2753 ### | |
2754 | |
2755 ### INDIVIDUAL M-BIAS REPORTS | |
2756 | |
2757 sub print_mbias_report_multicore{ | |
2758 | |
2759 my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; | |
2760 | |
2761 # warn "\$report_filename is $report_filename\n"; | |
2762 my $special_mbias_report = $report_filename.".${offset}.mbias"; | |
2763 | |
2764 open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!; | |
2765 | |
2766 # determining maximum read length | |
2767 my $max_length_1 = 0; | |
2768 my $max_length_2 = 0; | |
2769 | |
2770 foreach my $context (keys %mbias_1){ | |
2771 foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ | |
2772 $max_length_1 = $pos unless ($max_length_1 >= $pos); | |
2773 } | |
2774 } | |
2775 if ($paired){ | |
2776 foreach my $context (keys %mbias_2){ | |
2777 foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ | |
2778 $max_length_2 = $pos unless ($max_length_2 >= $pos); | |
2779 } | |
2780 } | |
2781 } | |
2782 | |
2783 if ($single){ | |
2784 # warn "Determining maximum read length for M-Bias plot\n"; | |
2785 # warn "Maximum read length of Read 1: $max_length_1\n\n"; | |
2786 } | |
2787 else{ | |
2788 # warn "Determining maximum read lengths for M-Bias plots\n"; | |
2789 # warn "Maximum read length of Read 1: $max_length_1\n"; | |
2790 # warn "Maximum read length of Read 2: $max_length_2\n\n"; | |
2791 } | |
2792 | |
2793 foreach my $context (qw(CpG CHG CHH)){ | |
2794 | |
2795 if ($paired){ | |
2796 print SPECIAL_MBIAS "$context context (R1)\n================\n"; | |
2797 } | |
2798 else{ | |
2799 print SPECIAL_MBIAS "$context context\n===========\n"; | |
2800 } | |
2801 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
2802 | |
2803 foreach my $pos (1..$max_length_1){ | |
2804 | |
2805 unless (defined $mbias_1{$context}->{$pos}->{meth}){ | |
2806 $mbias_1{$context}->{$pos}->{meth} = 0; | |
2807 } | |
2808 unless (defined $mbias_1{$context}->{$pos}->{un}){ | |
2809 $mbias_1{$context}->{$pos}->{un} = 0; | |
2810 } | |
2811 | |
2812 my $percent = ''; | |
2813 if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ | |
2814 $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); | |
2815 } | |
2816 my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; | |
2817 | |
2818 print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
2819 } | |
2820 print SPECIAL_MBIAS "\n"; | |
2821 } | |
2822 | |
2823 if ($paired){ | |
2824 | |
2825 foreach my $context (qw(CpG CHG CHH)){ | |
2826 | |
2827 print SPECIAL_MBIAS "$context context (R2)\n================\n"; | |
2828 print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; | |
2829 | |
2830 foreach my $pos (1..$max_length_2){ | |
2831 | |
2832 unless (defined $mbias_2{$context}->{$pos}->{meth}){ | |
2833 $mbias_2{$context}->{$pos}->{meth} = 0; | |
2834 } | |
2835 unless (defined $mbias_2{$context}->{$pos}->{un}){ | |
2836 $mbias_2{$context}->{$pos}->{un} = 0; | |
2837 } | |
2838 | |
2839 my $percent = ''; | |
2840 if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ | |
2841 $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); | |
2842 } | |
2843 my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; | |
2844 | |
2845 print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; | |
2846 } | |
2847 } | |
2848 } | |
2849 | |
2850 close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n"; | |
2851 } | |
2852 | |
2853 | |
2854 ### | |
2855 | |
2856 | |
2857 sub print_individual_C_methylation_states_paired_end_files{ | |
2858 | |
2859 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; | |
2860 | |
2861 unless (defined $meth_call) { | |
2862 return; # skip this read | |
2863 } | |
2864 | |
2865 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 | |
2866 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); | |
2867 | |
2868 my @methylation_calls = split(//,$meth_call); | |
2869 | |
2870 ################################################################# | |
2871 ### . for bases not involving cytosines ### | |
2872 ### X for methylated C in CHG context (was protected) ### | |
2873 ### x for not methylated C in CHG context (was converted) ### | |
2874 ### H for methylated C in CHH context (was protected) ### | |
2875 ### h for not methylated C in CHH context (was converted) ### | |
2876 ### Z for methylated C in CpG context (was protected) ### | |
2877 ### z for not methylated C in CpG context (was converted) ### | |
2878 ### U for methylated C in Unknown context (was protected) ### | |
2879 ### u for not methylated C in Unknown context (was converted) ### | |
2880 ################################################################# | |
2881 | |
2882 my $methyl_CHG_count = 0; | |
2883 my $methyl_CHH_count = 0; | |
2884 my $methyl_CpG_count = 0; | |
2885 my $unmethylated_CHG_count = 0; | |
2886 my $unmethylated_CHH_count = 0; | |
2887 my $unmethylated_CpG_count = 0; | |
2888 | |
2889 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
2890 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
2891 my @comp_cigar; | |
2892 | |
2893 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing | |
2894 if ($cigar =~ /^\d+M$/){ | |
2895 # this check speeds up the extraction process by up to 60%!!! | |
2896 } | |
2897 else{ # parsing CIGAR string | |
2898 my @len; | |
2899 my @ops; | |
2900 @len = split (/\D+/,$cigar); # storing the length per operation | |
2901 @ops = split (/\d+/,$cigar); # storing the operation | |
2902 shift @ops; # remove the empty first element | |
2903 | |
2904 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
2905 | |
2906 foreach my $index (0..$#len){ | |
2907 foreach (1..$len[$index]){ | |
2908 # print "$ops[$index]"; | |
2909 push @comp_cigar, $ops[$index]; | |
2910 } | |
2911 } | |
2912 # warn "\nDetected CIGAR string: $cigar\n"; | |
2913 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
2914 # warn "number of operations: ",scalar @ops,"\n"; | |
2915 # warn "number of length digits: ",scalar @len,"\n\n"; | |
2916 # print @comp_cigar,"\n"; | |
2917 # print "$meth_call\n\n"; | |
2918 # sleep (1); | |
2919 } | |
2920 | |
2921 if ($strand eq '-') { | |
2922 | |
2923 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above | |
2924 if (@comp_cigar){ | |
2925 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
2926 } | |
2927 # print "reverse CIGAR string: @comp_cigar\n"; | |
2928 | |
2929 ### the start position of paired-end files has already been corrected, see above | |
2930 } | |
2931 | |
2932 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified | |
2933 | |
2934 if ($merge_non_CpG) { | |
2935 if ($no_overlap) { # this has to be read 2... | |
2936 | |
2937 ### single-file CpG and non-CpG context output | |
2938 if ($full) { | |
2939 if ($strand eq '+') { | |
2940 for my $index (0..$#methylation_calls) { | |
2941 | |
2942 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
2943 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
2944 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
2945 $cigar_offset += $cigar_mod; | |
2946 $pos_offset += $pos_mod; | |
2947 } | |
2948 | |
2949 ### Returning as soon as the methylation calls start overlapping | |
2950 if ($start+$index+$pos_offset >= $end_read_1) { | |
2951 return; | |
2952 } | |
2953 | |
2954 if ($methylation_calls[$index] eq 'X') { | |
2955 $counting{total_meCHG_count}++; | |
2956 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
2957 if ($read_identity == 1){ | |
2958 $mbias_1{CHG}->{$index+1}->{meth}++; | |
2959 } | |
2960 else{ | |
2961 $mbias_2{CHG}->{$index+1}->{meth}++; | |
2962 } | |
2963 } | |
2964 elsif ($methylation_calls[$index] eq 'x') { | |
2965 $counting{total_unmethylated_CHG_count}++; | |
2966 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
2967 if ($read_identity == 1){ | |
2968 $mbias_1{CHG}->{$index+1}->{un}++; | |
2969 } | |
2970 else{ | |
2971 $mbias_2{CHG}->{$index+1}->{un}++; | |
2972 } | |
2973 } | |
2974 elsif ($methylation_calls[$index] eq 'Z') { | |
2975 $counting{total_meCpG_count}++; | |
2976 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
2977 if ($read_identity == 1){ | |
2978 $mbias_1{CpG}->{$index+1}->{meth}++; | |
2979 } | |
2980 else{ | |
2981 $mbias_2{CpG}->{$index+1}->{meth}++; | |
2982 } | |
2983 } | |
2984 elsif ($methylation_calls[$index] eq 'z') { | |
2985 $counting{total_unmethylated_CpG_count}++; | |
2986 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
2987 if ($read_identity == 1){ | |
2988 $mbias_1{CpG}->{$index+1}->{un}++; | |
2989 } | |
2990 else{ | |
2991 $mbias_2{CpG}->{$index+1}->{un}++; | |
2992 } | |
2993 } | |
2994 elsif ($methylation_calls[$index] eq 'H') { | |
2995 $counting{total_meCHH_count}++; | |
2996 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
2997 if ($read_identity == 1){ | |
2998 $mbias_1{CHH}->{$index+1}->{meth}++; | |
2999 } | |
3000 else{ | |
3001 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3002 } | |
3003 } | |
3004 elsif ($methylation_calls[$index] eq 'h') { | |
3005 $counting{total_unmethylated_CHH_count}++; | |
3006 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3007 if ($read_identity == 1){ | |
3008 $mbias_1{CHH}->{$index+1}->{un}++; | |
3009 } | |
3010 else{ | |
3011 $mbias_2{CHH}->{$index+1}->{un}++; | |
3012 } | |
3013 } | |
3014 elsif ($methylation_calls[$index] eq '.'){} | |
3015 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3016 else{ | |
3017 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
3018 } | |
3019 } | |
3020 } | |
3021 elsif ($strand eq '-') { | |
3022 for my $index (0..$#methylation_calls) { | |
3023 | |
3024 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3025 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3026 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3027 $cigar_offset += $cigar_mod; | |
3028 $pos_offset += $pos_mod; | |
3029 } | |
3030 | |
3031 ### Returning as soon as the methylation calls start overlapping | |
3032 if ($start-$index+$pos_offset <= $end_read_1) { | |
3033 return; | |
3034 } | |
3035 | |
3036 if ($methylation_calls[$index] eq 'X') { | |
3037 $counting{total_meCHG_count}++; | |
3038 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3039 if ($read_identity == 1){ | |
3040 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3041 } | |
3042 else{ | |
3043 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3044 } | |
3045 } | |
3046 elsif ($methylation_calls[$index] eq 'x') { | |
3047 $counting{total_unmethylated_CHG_count}++; | |
3048 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3049 if ($read_identity == 1){ | |
3050 $mbias_1{CHG}->{$index+1}->{un}++; | |
3051 } | |
3052 else{ | |
3053 $mbias_2{CHG}->{$index+1}->{un}++; | |
3054 } | |
3055 } | |
3056 elsif ($methylation_calls[$index] eq 'Z') { | |
3057 $counting{total_meCpG_count}++; | |
3058 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3059 if ($read_identity == 1){ | |
3060 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3061 } | |
3062 else{ | |
3063 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3064 } | |
3065 } | |
3066 elsif ($methylation_calls[$index] eq 'z') { | |
3067 $counting{total_unmethylated_CpG_count}++; | |
3068 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3069 if ($read_identity == 1){ | |
3070 $mbias_1{CpG}->{$index+1}->{un}++; | |
3071 } | |
3072 else{ | |
3073 $mbias_2{CpG}->{$index+1}->{un}++; | |
3074 } | |
3075 } | |
3076 elsif ($methylation_calls[$index] eq 'H') { | |
3077 $counting{total_meCHH_count}++; | |
3078 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3079 if ($read_identity == 1){ | |
3080 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3081 } | |
3082 else{ | |
3083 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3084 } | |
3085 } | |
3086 elsif ($methylation_calls[$index] eq 'h') { | |
3087 $counting{total_unmethylated_CHH_count}++; | |
3088 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3089 if ($read_identity == 1){ | |
3090 $mbias_1{CHH}->{$index+1}->{un}++; | |
3091 } | |
3092 else{ | |
3093 $mbias_2{CHH}->{$index+1}->{un}++; | |
3094 } | |
3095 } | |
3096 elsif ($methylation_calls[$index] eq '.') {} | |
3097 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3098 else{ | |
3099 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
3100 } | |
3101 } | |
3102 } else { | |
3103 die "The read orientation was neither + nor -: '$strand'\n"; | |
3104 } | |
3105 } | |
3106 | |
3107 ### strand-specific methylation output | |
3108 else { | |
3109 if ($strand eq '+') { | |
3110 for my $index (0..$#methylation_calls) { | |
3111 | |
3112 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3113 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3114 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3115 $cigar_offset += $cigar_mod; | |
3116 $pos_offset += $pos_mod; | |
3117 } | |
3118 | |
3119 ### Returning as soon as the methylation calls start overlapping | |
3120 if ($start+$index+$pos_offset >= $end_read_1) { | |
3121 return; | |
3122 } | |
3123 | |
3124 if ($methylation_calls[$index] eq 'X') { | |
3125 $counting{total_meCHG_count}++; | |
3126 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3127 if ($read_identity == 1){ | |
3128 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3129 } | |
3130 else{ | |
3131 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3132 } | |
3133 } | |
3134 elsif ($methylation_calls[$index] eq 'x') { | |
3135 $counting{total_unmethylated_CHG_count}++; | |
3136 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3137 if ($read_identity == 1){ | |
3138 $mbias_1{CHG}->{$index+1}->{un}++; | |
3139 } | |
3140 else{ | |
3141 $mbias_2{CHG}->{$index+1}->{un}++; | |
3142 } | |
3143 } | |
3144 elsif ($methylation_calls[$index] eq 'Z') { | |
3145 $counting{total_meCpG_count}++; | |
3146 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3147 if ($read_identity == 1){ | |
3148 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3149 } | |
3150 else{ | |
3151 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3152 } | |
3153 } | |
3154 elsif ($methylation_calls[$index] eq 'z') { | |
3155 $counting{total_unmethylated_CpG_count}++; | |
3156 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3157 if ($read_identity == 1){ | |
3158 $mbias_1{CpG}->{$index+1}->{un}++; | |
3159 } | |
3160 else{ | |
3161 $mbias_2{CpG}->{$index+1}->{un}++; | |
3162 } | |
3163 } | |
3164 elsif ($methylation_calls[$index] eq 'H') { | |
3165 $counting{total_meCHH_count}++; | |
3166 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3167 if ($read_identity == 1){ | |
3168 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3169 } | |
3170 else{ | |
3171 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3172 } | |
3173 } | |
3174 elsif ($methylation_calls[$index] eq 'h') { | |
3175 $counting{total_unmethylated_CHH_count}++; | |
3176 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3177 if ($read_identity == 1){ | |
3178 $mbias_1{CHH}->{$index+1}->{un}++; | |
3179 } | |
3180 else{ | |
3181 $mbias_2{CHH}->{$index+1}->{un}++; | |
3182 } | |
3183 } | |
3184 elsif ($methylation_calls[$index] eq '.') {} | |
3185 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3186 else{ | |
3187 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3188 } | |
3189 } | |
3190 } elsif ($strand eq '-') { | |
3191 for my $index (0..$#methylation_calls) { | |
3192 | |
3193 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3194 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3195 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3196 $cigar_offset += $cigar_mod; | |
3197 $pos_offset += $pos_mod; | |
3198 } | |
3199 | |
3200 ### Returning as soon as the methylation calls start overlapping | |
3201 if ($start-$index+$pos_offset <= $end_read_1) { | |
3202 return; | |
3203 } | |
3204 | |
3205 if ($methylation_calls[$index] eq 'X') { | |
3206 $counting{total_meCHG_count}++; | |
3207 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3208 if ($read_identity == 1){ | |
3209 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3210 } | |
3211 else{ | |
3212 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3213 } | |
3214 } | |
3215 elsif ($methylation_calls[$index] eq 'x') { | |
3216 $counting{total_unmethylated_CHG_count}++; | |
3217 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3218 if ($read_identity == 1){ | |
3219 $mbias_1{CHG}->{$index+1}->{un}++; | |
3220 } | |
3221 else{ | |
3222 $mbias_2{CHG}->{$index+1}->{un}++; | |
3223 } | |
3224 } | |
3225 elsif ($methylation_calls[$index] eq 'Z') { | |
3226 $counting{total_meCpG_count}++; | |
3227 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3228 if ($read_identity == 1){ | |
3229 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3230 } | |
3231 else{ | |
3232 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3233 } | |
3234 } | |
3235 elsif ($methylation_calls[$index] eq 'z') { | |
3236 $counting{total_unmethylated_CpG_count}++; | |
3237 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3238 if ($read_identity == 1){ | |
3239 $mbias_1{CpG}->{$index+1}->{un}++; | |
3240 } | |
3241 else{ | |
3242 $mbias_2{CpG}->{$index+1}->{un}++; | |
3243 } | |
3244 } | |
3245 elsif ($methylation_calls[$index] eq 'H') { | |
3246 $counting{total_meCHH_count}++; | |
3247 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3248 if ($read_identity == 1){ | |
3249 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3250 } | |
3251 else{ | |
3252 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3253 } | |
3254 } | |
3255 elsif ($methylation_calls[$index] eq 'h') { | |
3256 $counting{total_unmethylated_CHH_count}++; | |
3257 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3258 if ($read_identity == 1){ | |
3259 $mbias_1{CHH}->{$index+1}->{un}++; | |
3260 } | |
3261 else{ | |
3262 $mbias_2{CHH}->{$index+1}->{un}++; | |
3263 } | |
3264 } | |
3265 elsif ($methylation_calls[$index] eq '.') {} | |
3266 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3267 else{ | |
3268 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3269 } | |
3270 } | |
3271 } else { | |
3272 die "The strand orientation was neither + nor -: '$strand'/n"; | |
3273 } | |
3274 } | |
3275 } | |
3276 | |
3277 ### this is the default paired-end procedure allowing overlaps and using every single C position | |
3278 ### Still within the 2-CONTEXT ONLY optional section | |
3279 else { | |
3280 ### single-file CpG and non-CpG context output | |
3281 if ($full) { | |
3282 if ($strand eq '+') { | |
3283 for my $index (0..$#methylation_calls) { | |
3284 | |
3285 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3286 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3287 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3288 $cigar_offset += $cigar_mod; | |
3289 $pos_offset += $pos_mod; | |
3290 } | |
3291 | |
3292 if ($methylation_calls[$index] eq 'X') { | |
3293 $counting{total_meCHG_count}++; | |
3294 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3295 if ($read_identity == 1){ | |
3296 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3297 } | |
3298 else{ | |
3299 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3300 } | |
3301 } | |
3302 elsif ($methylation_calls[$index] eq 'x') { | |
3303 $counting{total_unmethylated_CHG_count}++; | |
3304 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3305 if ($read_identity == 1){ | |
3306 $mbias_1{CHG}->{$index+1}->{un}++; | |
3307 } | |
3308 else{ | |
3309 $mbias_2{CHG}->{$index+1}->{un}++; | |
3310 } | |
3311 } | |
3312 elsif ($methylation_calls[$index] eq 'Z') { | |
3313 $counting{total_meCpG_count}++; | |
3314 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3315 if ($read_identity == 1){ | |
3316 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3317 } | |
3318 else{ | |
3319 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3320 } | |
3321 } | |
3322 elsif ($methylation_calls[$index] eq 'z') { | |
3323 $counting{total_unmethylated_CpG_count}++; | |
3324 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3325 if ($read_identity == 1){ | |
3326 $mbias_1{CpG}->{$index+1}->{un}++; | |
3327 } | |
3328 else{ | |
3329 $mbias_2{CpG}->{$index+1}->{un}++; | |
3330 } | |
3331 } | |
3332 elsif ($methylation_calls[$index] eq 'H') { | |
3333 $counting{total_meCHH_count}++; | |
3334 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3335 if ($read_identity == 1){ | |
3336 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3337 } | |
3338 else{ | |
3339 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3340 } | |
3341 } | |
3342 elsif ($methylation_calls[$index] eq 'h') { | |
3343 $counting{total_unmethylated_CHH_count}++; | |
3344 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3345 if ($read_identity == 1){ | |
3346 $mbias_1{CHH}->{$index+1}->{un}++; | |
3347 } | |
3348 else{ | |
3349 $mbias_2{CHH}->{$index+1}->{un}++; | |
3350 } | |
3351 } | |
3352 elsif ($methylation_calls[$index] eq '.') {} | |
3353 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3354 else{ | |
3355 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
3356 } | |
3357 } | |
3358 } elsif ($strand eq '-') { | |
3359 for my $index (0..$#methylation_calls) { | |
3360 | |
3361 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3362 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3363 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3364 $cigar_offset += $cigar_mod; | |
3365 $pos_offset += $pos_mod; | |
3366 } | |
3367 | |
3368 if ($methylation_calls[$index] eq 'X') { | |
3369 $counting{total_meCHG_count}++; | |
3370 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3371 if ($read_identity == 1){ | |
3372 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3373 } | |
3374 else{ | |
3375 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3376 } | |
3377 } | |
3378 elsif ($methylation_calls[$index] eq 'x') { | |
3379 $counting{total_unmethylated_CHG_count}++; | |
3380 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3381 if ($read_identity == 1){ | |
3382 $mbias_1{CHG}->{$index+1}->{un}++; | |
3383 } | |
3384 else{ | |
3385 $mbias_2{CHG}->{$index+1}->{un}++; | |
3386 } | |
3387 } | |
3388 elsif ($methylation_calls[$index] eq 'Z') { | |
3389 $counting{total_meCpG_count}++; | |
3390 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3391 if ($read_identity == 1){ | |
3392 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3393 } | |
3394 else{ | |
3395 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3396 } | |
3397 } | |
3398 elsif ($methylation_calls[$index] eq 'z') { | |
3399 $counting{total_unmethylated_CpG_count}++; | |
3400 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3401 if ($read_identity == 1){ | |
3402 $mbias_1{CpG}->{$index+1}->{un}++; | |
3403 } | |
3404 else{ | |
3405 $mbias_2{CpG}->{$index+1}->{un}++; | |
3406 } | |
3407 } | |
3408 elsif ($methylation_calls[$index] eq 'H') { | |
3409 $counting{total_meCHH_count}++; | |
3410 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3411 if ($read_identity == 1){ | |
3412 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3413 } | |
3414 else{ | |
3415 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3416 } | |
3417 } | |
3418 elsif ($methylation_calls[$index] eq 'h') { | |
3419 $counting{total_unmethylated_CHH_count}++; | |
3420 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3421 if ($read_identity == 1){ | |
3422 $mbias_1{CHH}->{$index+1}->{un}++; | |
3423 } | |
3424 else{ | |
3425 $mbias_2{CHH}->{$index+1}->{un}++; | |
3426 } | |
3427 } | |
3428 elsif ($methylation_calls[$index] eq '.') {} | |
3429 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3430 else{ | |
3431 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
3432 } | |
3433 } | |
3434 } else { | |
3435 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3436 } | |
3437 } | |
3438 | |
3439 ### strand-specific methylation output | |
3440 ### still within the 2-CONTEXT optional section | |
3441 else { | |
3442 if ($strand eq '+') { | |
3443 for my $index (0..$#methylation_calls) { | |
3444 | |
3445 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3446 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3447 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3448 $cigar_offset += $cigar_mod; | |
3449 $pos_offset += $pos_mod; | |
3450 } | |
3451 | |
3452 if ($methylation_calls[$index] eq 'X') { | |
3453 $counting{total_meCHG_count}++; | |
3454 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3455 if ($read_identity == 1){ | |
3456 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3457 } | |
3458 else{ | |
3459 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3460 } | |
3461 } | |
3462 elsif ($methylation_calls[$index] eq 'x') { | |
3463 $counting{total_unmethylated_CHG_count}++; | |
3464 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3465 if ($read_identity == 1){ | |
3466 $mbias_1{CHG}->{$index+1}->{un}++; | |
3467 } | |
3468 else{ | |
3469 $mbias_2{CHG}->{$index+1}->{un}++; | |
3470 } | |
3471 } | |
3472 elsif ($methylation_calls[$index] eq 'Z') { | |
3473 $counting{total_meCpG_count}++; | |
3474 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3475 if ($read_identity == 1){ | |
3476 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3477 } | |
3478 else{ | |
3479 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3480 } | |
3481 } | |
3482 elsif ($methylation_calls[$index] eq 'z') { | |
3483 $counting{total_unmethylated_CpG_count}++; | |
3484 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3485 if ($read_identity == 1){ | |
3486 $mbias_1{CpG}->{$index+1}->{un}++; | |
3487 } | |
3488 else{ | |
3489 $mbias_2{CpG}->{$index+1}->{un}++; | |
3490 } | |
3491 } | |
3492 elsif ($methylation_calls[$index] eq 'H') { | |
3493 $counting{total_meCHH_count}++; | |
3494 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3495 if ($read_identity == 1){ | |
3496 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3497 } | |
3498 else{ | |
3499 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3500 } | |
3501 } | |
3502 elsif ($methylation_calls[$index] eq 'h') { | |
3503 $counting{total_unmethylated_CHH_count}++; | |
3504 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3505 if ($read_identity == 1){ | |
3506 $mbias_1{CHH}->{$index+1}->{un}++; | |
3507 } | |
3508 else{ | |
3509 $mbias_2{CHH}->{$index+1}->{un}++; | |
3510 } | |
3511 } | |
3512 elsif ($methylation_calls[$index] eq '.') {} | |
3513 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3514 else{ | |
3515 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3516 } | |
3517 } | |
3518 } elsif ($strand eq '-') { | |
3519 for my $index (0..$#methylation_calls) { | |
3520 | |
3521 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3522 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3523 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3524 $cigar_offset += $cigar_mod; | |
3525 $pos_offset += $pos_mod; | |
3526 } | |
3527 | |
3528 if ($methylation_calls[$index] eq 'X') { | |
3529 $counting{total_meCHG_count}++; | |
3530 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3531 if ($read_identity == 1){ | |
3532 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3533 } | |
3534 else{ | |
3535 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3536 } | |
3537 } | |
3538 elsif ($methylation_calls[$index] eq 'x') { | |
3539 $counting{total_unmethylated_CHG_count}++; | |
3540 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3541 if ($read_identity == 1){ | |
3542 $mbias_1{CHG}->{$index+1}->{un}++; | |
3543 } | |
3544 else{ | |
3545 $mbias_2{CHG}->{$index+1}->{un}++; | |
3546 } | |
3547 } | |
3548 elsif ($methylation_calls[$index] eq 'Z') { | |
3549 $counting{total_meCpG_count}++; | |
3550 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3551 if ($read_identity == 1){ | |
3552 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3553 } | |
3554 else{ | |
3555 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3556 } | |
3557 } | |
3558 elsif ($methylation_calls[$index] eq 'z') { | |
3559 $counting{total_unmethylated_CpG_count}++; | |
3560 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3561 if ($read_identity == 1){ | |
3562 $mbias_1{CpG}->{$index+1}->{un}++; | |
3563 } | |
3564 else{ | |
3565 $mbias_2{CpG}->{$index+1}->{un}++; | |
3566 } | |
3567 } | |
3568 elsif ($methylation_calls[$index] eq 'H') { | |
3569 $counting{total_meCHH_count}++; | |
3570 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3571 if ($read_identity == 1){ | |
3572 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3573 } | |
3574 else{ | |
3575 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3576 } | |
3577 } | |
3578 elsif ($methylation_calls[$index] eq 'h') { | |
3579 $counting{total_unmethylated_CHH_count}++; | |
3580 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3581 if ($read_identity == 1){ | |
3582 $mbias_1{CHH}->{$index+1}->{un}++; | |
3583 } | |
3584 else{ | |
3585 $mbias_2{CHH}->{$index+1}->{un}++; | |
3586 } | |
3587 } | |
3588 elsif ($methylation_calls[$index] eq '.') {} | |
3589 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3590 else{ | |
3591 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3592 } | |
3593 } | |
3594 } else { | |
3595 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3596 } | |
3597 } | |
3598 } | |
3599 } | |
3600 | |
3601 ############################################ | |
3602 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### | |
3603 ############################################ | |
3604 | |
3605 elsif ($no_overlap) { | |
3606 ### single-file CpG, CHG and CHH context output | |
3607 if ($full) { | |
3608 if ($strand eq '+') { | |
3609 for my $index (0..$#methylation_calls) { | |
3610 | |
3611 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3612 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3613 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3614 $cigar_offset += $cigar_mod; | |
3615 $pos_offset += $pos_mod; | |
3616 } | |
3617 | |
3618 ### Returning as soon as the methylation calls start overlapping | |
3619 if ($start+$index+$pos_offset >= $end_read_1) { | |
3620 return; | |
3621 } | |
3622 | |
3623 if ($methylation_calls[$index] eq 'X') { | |
3624 $counting{total_meCHG_count}++; | |
3625 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3626 if ($read_identity == 1){ | |
3627 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3628 } | |
3629 else{ | |
3630 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3631 } | |
3632 } | |
3633 elsif ($methylation_calls[$index] eq 'x') { | |
3634 $counting{total_unmethylated_CHG_count}++; | |
3635 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3636 if ($read_identity == 1){ | |
3637 $mbias_1{CHG}->{$index+1}->{un}++; | |
3638 } | |
3639 else{ | |
3640 $mbias_2{CHG}->{$index+1}->{un}++; | |
3641 } | |
3642 } | |
3643 elsif ($methylation_calls[$index] eq 'Z') { | |
3644 $counting{total_meCpG_count}++; | |
3645 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3646 if ($read_identity == 1){ | |
3647 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3648 } | |
3649 else{ | |
3650 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3651 } | |
3652 } | |
3653 elsif ($methylation_calls[$index] eq 'z') { | |
3654 $counting{total_unmethylated_CpG_count}++; | |
3655 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3656 if ($read_identity == 1){ | |
3657 $mbias_1{CpG}->{$index+1}->{un}++; | |
3658 } | |
3659 else{ | |
3660 $mbias_2{CpG}->{$index+1}->{un}++; | |
3661 } | |
3662 } | |
3663 elsif ($methylation_calls[$index] eq 'H') { | |
3664 $counting{total_meCHH_count}++; | |
3665 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3666 if ($read_identity == 1){ | |
3667 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3668 } | |
3669 else{ | |
3670 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3671 } | |
3672 } | |
3673 elsif ($methylation_calls[$index] eq 'h') { | |
3674 $counting{total_unmethylated_CHH_count}++; | |
3675 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3676 if ($read_identity == 1){ | |
3677 $mbias_1{CHH}->{$index+1}->{un}++; | |
3678 } | |
3679 else{ | |
3680 $mbias_2{CHH}->{$index+1}->{un}++; | |
3681 } | |
3682 } | |
3683 elsif ($methylation_calls[$index] eq '.') {} | |
3684 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3685 else{ | |
3686 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3687 } | |
3688 } | |
3689 } elsif ($strand eq '-') { | |
3690 for my $index (0..$#methylation_calls) { | |
3691 | |
3692 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3693 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3694 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3695 $cigar_offset += $cigar_mod; | |
3696 $pos_offset += $pos_mod; | |
3697 } | |
3698 | |
3699 ### Returning as soon as the methylation calls start overlapping | |
3700 if ($start-$index+$pos_offset <= $end_read_1) { | |
3701 return; | |
3702 } | |
3703 | |
3704 if ($methylation_calls[$index] eq 'X') { | |
3705 $counting{total_meCHG_count}++; | |
3706 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3707 if ($read_identity == 1){ | |
3708 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3709 } | |
3710 else{ | |
3711 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3712 } | |
3713 } | |
3714 elsif ($methylation_calls[$index] eq 'x') { | |
3715 $counting{total_unmethylated_CHG_count}++; | |
3716 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3717 if ($read_identity == 1){ | |
3718 $mbias_1{CHG}->{$index+1}->{un}++; | |
3719 } | |
3720 else{ | |
3721 $mbias_2{CHG}->{$index+1}->{un}++; | |
3722 } | |
3723 } | |
3724 elsif ($methylation_calls[$index] eq 'Z') { | |
3725 $counting{total_meCpG_count}++; | |
3726 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3727 if ($read_identity == 1){ | |
3728 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3729 } | |
3730 else{ | |
3731 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3732 } | |
3733 } | |
3734 elsif ($methylation_calls[$index] eq 'z') { | |
3735 $counting{total_unmethylated_CpG_count}++; | |
3736 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3737 if ($read_identity == 1){ | |
3738 $mbias_1{CpG}->{$index+1}->{un}++; | |
3739 } | |
3740 else{ | |
3741 $mbias_2{CpG}->{$index+1}->{un}++; | |
3742 } | |
3743 } | |
3744 elsif ($methylation_calls[$index] eq 'H') { | |
3745 $counting{total_meCHH_count}++; | |
3746 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3747 if ($read_identity == 1){ | |
3748 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3749 } | |
3750 else{ | |
3751 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3752 } | |
3753 } | |
3754 elsif ($methylation_calls[$index] eq 'h') { | |
3755 $counting{total_unmethylated_CHH_count}++; | |
3756 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3757 if ($read_identity == 1){ | |
3758 $mbias_1{CHH}->{$index+1}->{un}++; | |
3759 } | |
3760 else{ | |
3761 $mbias_2{CHH}->{$index+1}->{un}++; | |
3762 } | |
3763 } | |
3764 elsif ($methylation_calls[$index] eq '.') {} | |
3765 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3766 else{ | |
3767 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3768 } | |
3769 } | |
3770 } else { | |
3771 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3772 } | |
3773 } | |
3774 | |
3775 ### strand-specific methylation output | |
3776 else { | |
3777 if ($strand eq '+') { | |
3778 for my $index (0..$#methylation_calls) { | |
3779 | |
3780 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3781 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3782 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3783 $cigar_offset += $cigar_mod; | |
3784 $pos_offset += $pos_mod; | |
3785 } | |
3786 | |
3787 ### Returning as soon as the methylation calls start overlapping | |
3788 if ($start+$index+$pos_offset >= $end_read_1) { | |
3789 return; | |
3790 } | |
3791 | |
3792 if ($methylation_calls[$index] eq 'X') { | |
3793 $counting{total_meCHG_count}++; | |
3794 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3795 if ($read_identity == 1){ | |
3796 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3797 } | |
3798 else{ | |
3799 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3800 } | |
3801 } | |
3802 elsif ($methylation_calls[$index] eq 'x') { | |
3803 $counting{total_unmethylated_CHG_count}++; | |
3804 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3805 if ($read_identity == 1){ | |
3806 $mbias_1{CHG}->{$index+1}->{un}++; | |
3807 } | |
3808 else{ | |
3809 $mbias_2{CHG}->{$index+1}->{un}++; | |
3810 } | |
3811 } | |
3812 elsif ($methylation_calls[$index] eq 'Z') { | |
3813 $counting{total_meCpG_count}++; | |
3814 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3815 if ($read_identity == 1){ | |
3816 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3817 } | |
3818 else{ | |
3819 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3820 } | |
3821 } | |
3822 elsif ($methylation_calls[$index] eq 'z') { | |
3823 $counting{total_unmethylated_CpG_count}++; | |
3824 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3825 if ($read_identity == 1){ | |
3826 $mbias_1{CpG}->{$index+1}->{un}++; | |
3827 } | |
3828 else{ | |
3829 $mbias_2{CpG}->{$index+1}->{un}++; | |
3830 } | |
3831 } | |
3832 elsif ($methylation_calls[$index] eq 'H') { | |
3833 $counting{total_meCHH_count}++; | |
3834 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3835 if ($read_identity == 1){ | |
3836 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3837 } | |
3838 else{ | |
3839 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3840 } | |
3841 } | |
3842 elsif ($methylation_calls[$index] eq 'h') { | |
3843 $counting{total_unmethylated_CHH_count}++; | |
3844 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3845 if ($read_identity == 1){ | |
3846 $mbias_1{CHH}->{$index+1}->{un}++; | |
3847 } | |
3848 else{ | |
3849 $mbias_2{CHH}->{$index+1}->{un}++; | |
3850 } | |
3851 } | |
3852 elsif ($methylation_calls[$index] eq '.') {} | |
3853 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3854 else{ | |
3855 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3856 } | |
3857 } | |
3858 } elsif ($strand eq '-') { | |
3859 for my $index (0..$#methylation_calls) { | |
3860 | |
3861 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3862 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
3863 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3864 $cigar_offset += $cigar_mod; | |
3865 $pos_offset += $pos_mod; | |
3866 } | |
3867 | |
3868 ### Returning as soon as the methylation calls start overlapping | |
3869 if ($start-$index+$pos_offset <= $end_read_1) { | |
3870 return; | |
3871 } | |
3872 | |
3873 if ($methylation_calls[$index] eq 'X') { | |
3874 $counting{total_meCHG_count}++; | |
3875 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3876 if ($read_identity == 1){ | |
3877 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3878 } | |
3879 else{ | |
3880 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3881 } | |
3882 } | |
3883 elsif ($methylation_calls[$index] eq 'x') { | |
3884 $counting{total_unmethylated_CHG_count}++; | |
3885 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3886 if ($read_identity == 1){ | |
3887 $mbias_1{CHG}->{$index+1}->{un}++; | |
3888 } | |
3889 else{ | |
3890 $mbias_2{CHG}->{$index+1}->{un}++; | |
3891 } | |
3892 } | |
3893 elsif ($methylation_calls[$index] eq 'Z') { | |
3894 $counting{total_meCpG_count}++; | |
3895 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3896 if ($read_identity == 1){ | |
3897 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3898 } | |
3899 else{ | |
3900 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3901 } | |
3902 } | |
3903 elsif ($methylation_calls[$index] eq 'z') { | |
3904 $counting{total_unmethylated_CpG_count}++; | |
3905 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3906 if ($read_identity == 1){ | |
3907 $mbias_1{CpG}->{$index+1}->{un}++; | |
3908 } | |
3909 else{ | |
3910 $mbias_2{CpG}->{$index+1}->{un}++; | |
3911 } | |
3912 } | |
3913 elsif ($methylation_calls[$index] eq 'H') { | |
3914 $counting{total_meCHH_count}++; | |
3915 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3916 if ($read_identity == 1){ | |
3917 $mbias_1{CHH}->{$index+1}->{meth}++; | |
3918 } | |
3919 else{ | |
3920 $mbias_2{CHH}->{$index+1}->{meth}++; | |
3921 } | |
3922 } | |
3923 elsif ($methylation_calls[$index] eq 'h') { | |
3924 $counting{total_unmethylated_CHH_count}++; | |
3925 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3926 if ($read_identity == 1){ | |
3927 $mbias_1{CHH}->{$index+1}->{un}++; | |
3928 } | |
3929 else{ | |
3930 $mbias_2{CHH}->{$index+1}->{un}++; | |
3931 } | |
3932 } | |
3933 elsif ($methylation_calls[$index] eq '.') {} | |
3934 elsif (lc$methylation_calls[$index] eq 'u'){} | |
3935 else{ | |
3936 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
3937 } | |
3938 } | |
3939 } else { | |
3940 die "The strand orientation as neither + nor -: '$strand'\n"; | |
3941 } | |
3942 } | |
3943 } | |
3944 | |
3945 ### this is the paired-end procedure allowing overlaps and using every single C position | |
3946 else { | |
3947 ### single-file CpG, CHG and CHH context output | |
3948 if ($full) { | |
3949 if ($strand eq '+') { | |
3950 for my $index (0..$#methylation_calls) { | |
3951 | |
3952 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
3953 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
3954 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
3955 $cigar_offset += $cigar_mod; | |
3956 $pos_offset += $pos_mod; | |
3957 } | |
3958 | |
3959 if ($methylation_calls[$index] eq 'X') { | |
3960 $counting{total_meCHG_count}++; | |
3961 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3962 if ($read_identity == 1){ | |
3963 $mbias_1{CHG}->{$index+1}->{meth}++; | |
3964 } | |
3965 else{ | |
3966 $mbias_2{CHG}->{$index+1}->{meth}++; | |
3967 } | |
3968 } | |
3969 elsif ($methylation_calls[$index] eq 'x') { | |
3970 $counting{total_unmethylated_CHG_count}++; | |
3971 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3972 if ($read_identity == 1){ | |
3973 $mbias_1{CHG}->{$index+1}->{un}++; | |
3974 } | |
3975 else{ | |
3976 $mbias_2{CHG}->{$index+1}->{un}++; | |
3977 } | |
3978 } | |
3979 elsif ($methylation_calls[$index] eq 'Z') { | |
3980 $counting{total_meCpG_count}++; | |
3981 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3982 if ($read_identity == 1){ | |
3983 $mbias_1{CpG}->{$index+1}->{meth}++; | |
3984 } | |
3985 else{ | |
3986 $mbias_2{CpG}->{$index+1}->{meth}++; | |
3987 } | |
3988 } | |
3989 elsif ($methylation_calls[$index] eq 'z') { | |
3990 $counting{total_unmethylated_CpG_count}++; | |
3991 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
3992 if ($read_identity == 1){ | |
3993 $mbias_1{CpG}->{$index+1}->{un}++; | |
3994 } | |
3995 else{ | |
3996 $mbias_2{CpG}->{$index+1}->{un}++; | |
3997 } | |
3998 } | |
3999 elsif ($methylation_calls[$index] eq 'H') { | |
4000 $counting{total_meCHH_count}++; | |
4001 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4002 if ($read_identity == 1){ | |
4003 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4004 } | |
4005 else{ | |
4006 $mbias_2{CHH}->{$index+1}->{meth}++; | |
4007 } | |
4008 } | |
4009 elsif ($methylation_calls[$index] eq 'h') { | |
4010 $counting{total_unmethylated_CHH_count}++; | |
4011 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4012 if ($read_identity == 1){ | |
4013 $mbias_1{CHH}->{$index+1}->{un}++; | |
4014 } | |
4015 else{ | |
4016 $mbias_2{CHH}->{$index+1}->{un}++; | |
4017 } | |
4018 } | |
4019 elsif ($methylation_calls[$index] eq '.') {} | |
4020 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4021 else{ | |
4022 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4023 } | |
4024 } | |
4025 } elsif ($strand eq '-') { | |
4026 for my $index (0..$#methylation_calls) { | |
4027 | |
4028 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
4029 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
4030 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4031 $cigar_offset += $cigar_mod; | |
4032 $pos_offset += $pos_mod; | |
4033 } | |
4034 | |
4035 if ($methylation_calls[$index] eq 'X') { | |
4036 $counting{total_meCHG_count}++; | |
4037 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4038 if ($read_identity == 1){ | |
4039 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4040 } | |
4041 else{ | |
4042 $mbias_2{CHG}->{$index+1}->{meth}++; | |
4043 } | |
4044 } | |
4045 elsif ($methylation_calls[$index] eq 'x') { | |
4046 $counting{total_unmethylated_CHG_count}++; | |
4047 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4048 if ($read_identity == 1){ | |
4049 $mbias_1{CHG}->{$index+1}->{un}++; | |
4050 } | |
4051 else{ | |
4052 $mbias_2{CHG}->{$index+1}->{un}++; | |
4053 } | |
4054 } | |
4055 elsif ($methylation_calls[$index] eq 'Z') { | |
4056 $counting{total_meCpG_count}++; | |
4057 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4058 if ($read_identity == 1){ | |
4059 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4060 } | |
4061 else{ | |
4062 $mbias_2{CpG}->{$index+1}->{meth}++; | |
4063 } | |
4064 } | |
4065 elsif ($methylation_calls[$index] eq 'z') { | |
4066 $counting{total_unmethylated_CpG_count}++; | |
4067 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4068 if ($read_identity == 1){ | |
4069 $mbias_1{CpG}->{$index+1}->{un}++; | |
4070 } | |
4071 else{ | |
4072 $mbias_2{CpG}->{$index+1}->{un}++; | |
4073 } | |
4074 } | |
4075 elsif ($methylation_calls[$index] eq 'H') { | |
4076 $counting{total_meCHH_count}++; | |
4077 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4078 if ($read_identity == 1){ | |
4079 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4080 } | |
4081 else{ | |
4082 $mbias_2{CHH}->{$index+1}->{meth}++; | |
4083 } | |
4084 } | |
4085 elsif ($methylation_calls[$index] eq 'h') { | |
4086 $counting{total_unmethylated_CHH_count}++; | |
4087 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4088 if ($read_identity == 1){ | |
4089 $mbias_1{CHH}->{$index+1}->{un}++; | |
4090 } | |
4091 else{ | |
4092 $mbias_2{CHH}->{$index+1}->{un}++; | |
4093 } | |
4094 } | |
4095 elsif ($methylation_calls[$index] eq '.') {} | |
4096 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4097 else{ | |
4098 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4099 } | |
4100 } | |
4101 } else { | |
4102 die "The strand orientation as neither + nor -: '$strand'\n"; | |
4103 } | |
4104 } | |
4105 | |
4106 ### strand-specific methylation output | |
4107 else { | |
4108 if ($strand eq '+') { | |
4109 for my $index (0..$#methylation_calls) { | |
4110 | |
4111 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
4112 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4113 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
4114 $cigar_offset += $cigar_mod; | |
4115 $pos_offset += $pos_mod; | |
4116 } | |
4117 | |
4118 if ($methylation_calls[$index] eq 'X') { | |
4119 $counting{total_meCHG_count}++; | |
4120 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4121 if ($read_identity == 1){ | |
4122 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4123 } | |
4124 else{ | |
4125 $mbias_2{CHG}->{$index+1}->{meth}++; | |
4126 } | |
4127 } | |
4128 elsif ($methylation_calls[$index] eq 'x') { | |
4129 $counting{total_unmethylated_CHG_count}++; | |
4130 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4131 if ($read_identity == 1){ | |
4132 $mbias_1{CHG}->{$index+1}->{un}++; | |
4133 } | |
4134 else{ | |
4135 $mbias_2{CHG}->{$index+1}->{un}++; | |
4136 } | |
4137 } | |
4138 elsif ($methylation_calls[$index] eq 'Z') { | |
4139 $counting{total_meCpG_count}++; | |
4140 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4141 if ($read_identity == 1){ | |
4142 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4143 } | |
4144 else{ | |
4145 $mbias_2{CpG}->{$index+1}->{meth}++; | |
4146 } | |
4147 } | |
4148 elsif ($methylation_calls[$index] eq 'z') { | |
4149 $counting{total_unmethylated_CpG_count}++; | |
4150 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4151 if ($read_identity == 1){ | |
4152 $mbias_1{CpG}->{$index+1}->{un}++; | |
4153 } | |
4154 else{ | |
4155 $mbias_2{CpG}->{$index+1}->{un}++; | |
4156 } | |
4157 } | |
4158 elsif ($methylation_calls[$index] eq 'H') { | |
4159 $counting{total_meCHH_count}++; | |
4160 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4161 if ($read_identity == 1){ | |
4162 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4163 } | |
4164 else{ | |
4165 $mbias_2{CHH}->{$index+1}->{meth}++; | |
4166 } | |
4167 } | |
4168 elsif ($methylation_calls[$index] eq 'h') { | |
4169 $counting{total_unmethylated_CHH_count}++; | |
4170 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4171 if ($read_identity == 1){ | |
4172 $mbias_1{CHH}->{$index+1}->{un}++; | |
4173 } | |
4174 else{ | |
4175 $mbias_2{CHH}->{$index+1}->{un}++; | |
4176 } | |
4177 } | |
4178 elsif ($methylation_calls[$index] eq '.') {} | |
4179 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4180 else{ | |
4181 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4182 } | |
4183 } | |
4184 } elsif ($strand eq '-') { | |
4185 for my $index (0..$#methylation_calls) { | |
4186 | |
4187 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels | |
4188 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
4189 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4190 $cigar_offset += $cigar_mod; | |
4191 $pos_offset += $pos_mod; | |
4192 } | |
4193 | |
4194 if ($methylation_calls[$index] eq 'X') { | |
4195 $counting{total_meCHG_count}++; | |
4196 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4197 if ($read_identity == 1){ | |
4198 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4199 } | |
4200 else{ | |
4201 $mbias_2{CHG}->{$index+1}->{meth}++; | |
4202 } | |
4203 } | |
4204 elsif ($methylation_calls[$index] eq 'x') { | |
4205 $counting{total_unmethylated_CHG_count}++; | |
4206 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4207 if ($read_identity == 1){ | |
4208 $mbias_1{CHG}->{$index+1}->{un}++; | |
4209 } | |
4210 else{ | |
4211 $mbias_2{CHG}->{$index+1}->{un}++; | |
4212 } | |
4213 } | |
4214 elsif ($methylation_calls[$index] eq 'Z') { | |
4215 $counting{total_meCpG_count}++; | |
4216 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4217 if ($read_identity == 1){ | |
4218 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4219 } | |
4220 else{ | |
4221 $mbias_2{CpG}->{$index+1}->{meth}++; | |
4222 } | |
4223 } | |
4224 elsif ($methylation_calls[$index] eq 'z') { | |
4225 $counting{total_unmethylated_CpG_count}++; | |
4226 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4227 if ($read_identity == 1){ | |
4228 $mbias_1{CpG}->{$index+1}->{un}++; | |
4229 } | |
4230 else{ | |
4231 $mbias_2{CpG}->{$index+1}->{un}++; | |
4232 } | |
4233 } | |
4234 elsif ($methylation_calls[$index] eq 'H') { | |
4235 $counting{total_meCHH_count}++; | |
4236 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4237 if ($read_identity == 1){ | |
4238 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4239 } | |
4240 else{ | |
4241 $mbias_2{CHH}->{$index+1}->{meth}++; | |
4242 } | |
4243 } | |
4244 elsif ($methylation_calls[$index] eq 'h') { | |
4245 $counting{total_unmethylated_CHH_count}++; | |
4246 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4247 if ($read_identity == 1){ | |
4248 $mbias_1{CHH}->{$index+1}->{un}++; | |
4249 } | |
4250 else{ | |
4251 $mbias_2{CHH}->{$index+1}->{un}++; | |
4252 } | |
4253 } | |
4254 elsif ($methylation_calls[$index] eq '.') {} | |
4255 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4256 else{ | |
4257 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4258 } | |
4259 } | |
4260 } else { | |
4261 die "The strand orientation as neither + nor -: '$strand'\n"; | |
4262 } | |
4263 } | |
4264 } | |
4265 } | |
4266 | |
4267 sub check_cigar_string { | |
4268 my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; | |
4269 # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; | |
4270 my ($new_cigar_offset,$new_pos_offset) = (0,0); | |
4271 | |
4272 if ($strand eq '+') { | |
4273 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
4274 | |
4275 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
4276 # warn "position needs no adjustment\n"; | |
4277 } | |
4278 | |
4279 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
4280 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
4281 # warn "adjusted genomic position by -1 bp (insertion)\n"; | |
4282 } | |
4283 | |
4284 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
4285 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
4286 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
4287 # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
4288 | |
4289 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
4290 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
4291 # warn "position needs no adjustment\n"; | |
4292 last; | |
4293 } | |
4294 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
4295 $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position | |
4296 # warn "adjusted genomic position by another -1 bp (insertion)\n"; | |
4297 last; | |
4298 } | |
4299 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
4300 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
4301 $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position | |
4302 # warn "adjusted genomic position by another +1 bp (deletion)\n"; | |
4303 } | |
4304 else{ | |
4305 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
4306 } | |
4307 } | |
4308 } | |
4309 else{ | |
4310 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
4311 } | |
4312 } | |
4313 | |
4314 elsif ($strand eq '-') { | |
4315 # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; | |
4316 | |
4317 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
4318 # warn "position needs no adjustment\n"; | |
4319 } | |
4320 | |
4321 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence | |
4322 $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position | |
4323 # warn "adjusted genomic position by +1 bp (insertion)\n"; | |
4324 } | |
4325 | |
4326 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
4327 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
4328 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
4329 # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; | |
4330 | |
4331 while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ | |
4332 if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position | |
4333 # warn "Found new 'M' operation; position needs no adjustment\n"; | |
4334 last; | |
4335 } | |
4336 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ | |
4337 $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position | |
4338 # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; | |
4339 last; | |
4340 } | |
4341 elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence | |
4342 $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index | |
4343 $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position | |
4344 # warn "adjusted genomic position by another -1 bp (deletion)\n"; | |
4345 } | |
4346 else{ | |
4347 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
4348 } | |
4349 } | |
4350 } | |
4351 else{ | |
4352 die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; | |
4353 } | |
4354 } | |
4355 # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; | |
4356 return ($new_cigar_offset,$new_pos_offset); | |
4357 } | |
4358 | |
4359 sub print_individual_C_methylation_states_single_end{ | |
4360 | |
4361 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; | |
4362 my @methylation_calls = split(//,$meth_call); | |
4363 | |
4364 ################################################################# | |
4365 ### . for bases not involving cytosines ### | |
4366 ### X for methylated C in CHG context (was protected) ### | |
4367 ### x for not methylated C in CHG context (was converted) ### | |
4368 ### H for methylated C in CHH context (was protected) ### | |
4369 ### h for not methylated C in CHH context (was converted) ### | |
4370 ### Z for methylated C in CpG context (was protected) ### | |
4371 ### z for not methylated C in CpG context (was converted) ### | |
4372 ################################################################# | |
4373 | |
4374 my $methyl_CHG_count = 0; | |
4375 my $methyl_CHH_count = 0; | |
4376 my $methyl_CpG_count = 0; | |
4377 my $unmethylated_CHG_count = 0; | |
4378 my $unmethylated_CHH_count = 0; | |
4379 my $unmethylated_CpG_count = 0; | |
4380 | |
4381 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions | |
4382 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels | |
4383 | |
4384 my @comp_cigar; | |
4385 | |
4386 if ($cigar){ # parsing CIGAR string | |
4387 | |
4388 ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing | |
4389 if ($cigar =~ /^\d+M$/){ | |
4390 # warn "See!? I told you so! $cigar\n"; | |
4391 # sleep(1); | |
4392 } | |
4393 else{ | |
4394 | |
4395 my @len; | |
4396 my @ops; | |
4397 | |
4398 @len = split (/\D+/,$cigar); # storing the length per operation | |
4399 @ops = split (/\d+/,$cigar); # storing the operation | |
4400 shift @ops; # remove the empty first element | |
4401 # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); | |
4402 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
4403 | |
4404 foreach my $index (0..$#len){ | |
4405 foreach (1..$len[$index]){ | |
4406 # print "$ops[$index]"; | |
4407 push @comp_cigar, $ops[$index]; | |
4408 } | |
4409 } | |
4410 } | |
4411 # warn "\nDetected CIGAR string: $cigar\n"; | |
4412 # warn "Length of methylation call: ",length $meth_call,"\n"; | |
4413 # warn "number of operations: ",scalar @ops,"\n"; | |
4414 # warn "number of length digits: ",scalar @len,"\n\n"; | |
4415 # print @comp_cigar,"\n"; | |
4416 # print "$meth_call\n\n"; | |
4417 # sleep (1); | |
4418 } | |
4419 | |
4420 ### adjusting the start position for all reads mapping to the reverse strand | |
4421 if ($strand eq '-') { | |
4422 | |
4423 if (@comp_cigar){ # only needed for SAM reads with InDels | |
4424 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too | |
4425 # print @comp_cigar,"\n"; | |
4426 } | |
4427 | |
4428 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
4429 | |
4430 if ($cigar){ ### SAM format | |
4431 if ($cigar =~ /^(\d+)M$/){ # linear match | |
4432 $start += $1 - 1; | |
4433 } | |
4434 else{ # InDel read | |
4435 my $MD_count = 0; | |
4436 foreach (@comp_cigar){ | |
4437 ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't | |
4438 } | |
4439 $start += $MD_count - 1; | |
4440 } | |
4441 } | |
4442 else{ ### vanilla format | |
4443 $start += length($meth_call)-1; | |
4444 } | |
4445 } | |
4446 } | |
4447 | |
4448 ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) | |
4449 | |
4450 ### single-file CpG and other-context output | |
4451 if ($full and $merge_non_CpG) { | |
4452 if ($strand eq '+') { | |
4453 for my $index (0..$#methylation_calls) { | |
4454 | |
4455 if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels | |
4456 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4457 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; | |
4458 $cigar_offset += $cigar_mod; | |
4459 $pos_offset += $pos_mod; | |
4460 } | |
4461 | |
4462 ### methylated Cs (any context) will receive a forward (+) orientation | |
4463 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4464 if ($methylation_calls[$index] eq 'X') { | |
4465 $counting{total_meCHG_count}++; | |
4466 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4467 print {$fhs{other_context}} $line unless($mbias_only); | |
4468 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4469 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4470 } | |
4471 elsif ($methylation_calls[$index] eq 'x') { | |
4472 $counting{total_unmethylated_CHG_count}++; | |
4473 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4474 print {$fhs{other_context}} $line unless($mbias_only); | |
4475 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4476 $mbias_1{CHG}->{$index+1}->{un}++; | |
4477 } | |
4478 elsif ($methylation_calls[$index] eq 'Z') { | |
4479 $counting{total_meCpG_count}++; | |
4480 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4481 print {$fhs{CpG_context}} $line unless($mbias_only); | |
4482 # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4483 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4484 } | |
4485 elsif ($methylation_calls[$index] eq 'z') { | |
4486 $counting{total_unmethylated_CpG_count}++; | |
4487 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4488 print {$fhs{CpG_context}} $line unless($mbias_only); | |
4489 # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4490 $mbias_1{CpG}->{$index+1}->{un}++; | |
4491 } | |
4492 elsif ($methylation_calls[$index] eq 'H') { | |
4493 $counting{total_meCHH_count}++; | |
4494 my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4495 print {$fhs{other_context}} $line unless($mbias_only); | |
4496 # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4497 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4498 } | |
4499 elsif ($methylation_calls[$index] eq 'h') { | |
4500 $counting{total_unmethylated_CHH_count}++; | |
4501 my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4502 print {$fhs{other_context}} $line unless($mbias_only); | |
4503 # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4504 $mbias_1{CHH}->{$index+1}->{un}++; | |
4505 } | |
4506 elsif ($methylation_calls[$index] eq '.') {} | |
4507 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4508 else{ | |
4509 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4510 } | |
4511 } | |
4512 } | |
4513 elsif ($strand eq '-') { | |
4514 | |
4515 for my $index (0..$#methylation_calls) { | |
4516 ### methylated Cs (any context) will receive a forward (+) orientation | |
4517 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4518 | |
4519 if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels | |
4520 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; | |
4521 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4522 $cigar_offset += $cigar_mod; | |
4523 $pos_offset += $pos_mod; | |
4524 } | |
4525 | |
4526 if ($methylation_calls[$index] eq 'X') { | |
4527 $counting{total_meCHG_count}++; | |
4528 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4529 print {$fhs{other_context}} $line unless($mbias_only); | |
4530 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4531 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4532 } | |
4533 elsif ($methylation_calls[$index] eq 'x') { | |
4534 $counting{total_unmethylated_CHG_count}++; | |
4535 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4536 print {$fhs{other_context}} $line unless($mbias_only); | |
4537 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4538 $mbias_1{CHG}->{$index+1}->{un}++; | |
4539 } | |
4540 elsif ($methylation_calls[$index] eq 'Z') { | |
4541 $counting{total_meCpG_count}++; | |
4542 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4543 print {$fhs{CpG_context}} $line unless($mbias_only); | |
4544 #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4545 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4546 } | |
4547 elsif ($methylation_calls[$index] eq 'z') { | |
4548 $counting{total_unmethylated_CpG_count}++; | |
4549 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4550 print {$fhs{CpG_context}} $line unless($mbias_only); | |
4551 #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4552 $mbias_1{CpG}->{$index+1}->{un}++; | |
4553 } | |
4554 elsif ($methylation_calls[$index] eq 'H') { | |
4555 $counting{total_meCHH_count}++; | |
4556 my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4557 print {$fhs{other_context}} $line unless($mbias_only); | |
4558 #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4559 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4560 } | |
4561 elsif ($methylation_calls[$index] eq 'h') { | |
4562 $counting{total_unmethylated_CHH_count}++; | |
4563 my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; | |
4564 print {$fhs{other_context}} $line unless($mbias_only); | |
4565 #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4566 $mbias_1{CHH}->{$index+1}->{un}++; | |
4567 } | |
4568 elsif ($methylation_calls[$index] eq '.'){} | |
4569 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4570 else{ | |
4571 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4572 } | |
4573 } | |
4574 } | |
4575 else { | |
4576 die "The strand information was neither + nor -: $strand\n"; | |
4577 } | |
4578 } | |
4579 | |
4580 ### strand-specific methylation output | |
4581 elsif ($merge_non_CpG) { | |
4582 if ($strand eq '+') { | |
4583 for my $index (0..$#methylation_calls) { | |
4584 ### methylated Cs (any context) will receive a forward (+) orientation | |
4585 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4586 | |
4587 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4588 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4589 $cigar_offset += $cigar_mod; | |
4590 $pos_offset += $pos_mod; | |
4591 } | |
4592 | |
4593 if ($methylation_calls[$index] eq 'X') { | |
4594 $counting{total_meCHG_count}++; | |
4595 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4596 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4597 } | |
4598 elsif ($methylation_calls[$index] eq 'x') { | |
4599 $counting{total_unmethylated_CHG_count}++; | |
4600 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4601 $mbias_1{CHG}->{$index+1}->{un}++; | |
4602 } | |
4603 elsif ($methylation_calls[$index] eq 'Z') { | |
4604 $counting{total_meCpG_count}++; | |
4605 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4606 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4607 } | |
4608 elsif ($methylation_calls[$index] eq 'z') { | |
4609 $counting{total_unmethylated_CpG_count}++; | |
4610 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4611 $mbias_1{CpG}->{$index+1}->{un}++; | |
4612 } | |
4613 elsif ($methylation_calls[$index] eq 'H') { | |
4614 $counting{total_meCHH_count}++; | |
4615 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4616 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4617 } | |
4618 elsif ($methylation_calls[$index] eq 'h') { | |
4619 $counting{total_unmethylated_CHH_count}++; | |
4620 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4621 $mbias_1{CHH}->{$index+1}->{un}++; | |
4622 } | |
4623 elsif ($methylation_calls[$index] eq '.') {} | |
4624 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4625 else{ | |
4626 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4627 } | |
4628 } | |
4629 } | |
4630 elsif ($strand eq '-') { | |
4631 | |
4632 for my $index (0..$#methylation_calls) { | |
4633 ### methylated Cs (any context) will receive a forward (+) orientation | |
4634 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4635 | |
4636 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4637 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4638 $cigar_offset += $cigar_mod; | |
4639 $pos_offset += $pos_mod; | |
4640 } | |
4641 | |
4642 if ($methylation_calls[$index] eq 'X') { | |
4643 $counting{total_meCHG_count}++; | |
4644 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4645 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4646 } | |
4647 elsif ($methylation_calls[$index] eq 'x') { | |
4648 $counting{total_unmethylated_CHG_count}++; | |
4649 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4650 $mbias_1{CHG}->{$index+1}->{un}++; | |
4651 } | |
4652 elsif ($methylation_calls[$index] eq 'Z') { | |
4653 $counting{total_meCpG_count}++; | |
4654 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4655 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4656 } | |
4657 elsif ($methylation_calls[$index] eq 'z') { | |
4658 $counting{total_unmethylated_CpG_count}++; | |
4659 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4660 $mbias_1{CpG}->{$index+1}->{un}++; | |
4661 } | |
4662 elsif ($methylation_calls[$index] eq 'H') { | |
4663 $counting{total_meCHH_count}++; | |
4664 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4665 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4666 } | |
4667 elsif ($methylation_calls[$index] eq 'h') { | |
4668 $counting{total_unmethylated_CHH_count}++; | |
4669 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4670 $mbias_1{CHH}->{$index+1}->{un}++; | |
4671 } | |
4672 elsif ($methylation_calls[$index] eq '.') {} | |
4673 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4674 else{ | |
4675 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4676 } | |
4677 } | |
4678 } | |
4679 else { | |
4680 die "The strand information was neither + nor -: $strand\n"; | |
4681 } | |
4682 } | |
4683 | |
4684 ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION | |
4685 | |
4686 elsif ($full) { | |
4687 if ($strand eq '+') { | |
4688 for my $index (0..$#methylation_calls) { | |
4689 ### methylated Cs (any context) will receive a forward (+) orientation | |
4690 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4691 | |
4692 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4693 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4694 $cigar_offset += $cigar_mod; | |
4695 $pos_offset += $pos_mod; | |
4696 } | |
4697 | |
4698 if ($methylation_calls[$index] eq 'X') { | |
4699 $counting{total_meCHG_count}++; | |
4700 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4701 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4702 } | |
4703 elsif ($methylation_calls[$index] eq 'x') { | |
4704 $counting{total_unmethylated_CHG_count}++; | |
4705 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4706 $mbias_1{CHG}->{$index+1}->{un}++; | |
4707 } | |
4708 elsif ($methylation_calls[$index] eq 'Z') { | |
4709 $counting{total_meCpG_count}++; | |
4710 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4711 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4712 } | |
4713 elsif ($methylation_calls[$index] eq 'z') { | |
4714 $counting{total_unmethylated_CpG_count}++; | |
4715 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4716 $mbias_1{CpG}->{$index+1}->{un}++; | |
4717 } | |
4718 elsif ($methylation_calls[$index] eq 'H') { | |
4719 $counting{total_meCHH_count}++; | |
4720 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4721 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4722 } | |
4723 elsif ($methylation_calls[$index] eq 'h') { | |
4724 $counting{total_unmethylated_CHH_count}++; | |
4725 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4726 $mbias_1{CHH}->{$index+1}->{un}++; | |
4727 } | |
4728 elsif ($methylation_calls[$index] eq '.') {} | |
4729 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4730 else{ | |
4731 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); | |
4732 } | |
4733 } | |
4734 } | |
4735 elsif ($strand eq '-') { | |
4736 | |
4737 for my $index (0..$#methylation_calls) { | |
4738 ### methylated Cs (any context) will receive a forward (+) orientation | |
4739 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4740 | |
4741 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4743 $cigar_offset += $cigar_mod; | |
4744 $pos_offset += $pos_mod; | |
4745 } | |
4746 | |
4747 if ($methylation_calls[$index] eq 'X') { | |
4748 $counting{total_meCHG_count}++; | |
4749 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4750 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4751 } | |
4752 elsif ($methylation_calls[$index] eq 'x') { | |
4753 $counting{total_unmethylated_CHG_count}++; | |
4754 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4755 $mbias_1{CHG}->{$index+1}->{un}++; | |
4756 } | |
4757 elsif ($methylation_calls[$index] eq 'Z') { | |
4758 $counting{total_meCpG_count}++; | |
4759 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4760 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4761 } | |
4762 elsif ($methylation_calls[$index] eq 'z') { | |
4763 $counting{total_unmethylated_CpG_count}++; | |
4764 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4765 $mbias_1{CpG}->{$index+1}->{un}++; | |
4766 } | |
4767 elsif ($methylation_calls[$index] eq 'H') { | |
4768 $counting{total_meCHH_count}++; | |
4769 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4770 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4771 } | |
4772 elsif ($methylation_calls[$index] eq 'h') { | |
4773 $counting{total_unmethylated_CHH_count}++; | |
4774 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4775 $mbias_1{CHH}->{$index+1}->{un}++; | |
4776 } | |
4777 elsif ($methylation_calls[$index] eq '.') {} | |
4778 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4779 else{ | |
4780 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4781 } | |
4782 } | |
4783 } | |
4784 else { | |
4785 die "The read had a strand orientation which was neither + nor -: $strand\n"; | |
4786 } | |
4787 } | |
4788 | |
4789 ### strand-specific methylation output | |
4790 else { | |
4791 if ($strand eq '+') { | |
4792 for my $index (0..$#methylation_calls) { | |
4793 ### methylated Cs (any context) will receive a forward (+) orientation | |
4794 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4795 | |
4796 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4797 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4798 $cigar_offset += $cigar_mod; | |
4799 $pos_offset += $pos_mod; | |
4800 } | |
4801 | |
4802 if ($methylation_calls[$index] eq 'X') { | |
4803 $counting{total_meCHG_count}++; | |
4804 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4805 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4806 } | |
4807 elsif ($methylation_calls[$index] eq 'x') { | |
4808 $counting{total_unmethylated_CHG_count}++; | |
4809 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4810 $mbias_1{CHG}->{$index+1}->{un}++; | |
4811 } | |
4812 elsif ($methylation_calls[$index] eq 'Z') { | |
4813 $counting{total_meCpG_count}++; | |
4814 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4815 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4816 } | |
4817 elsif ($methylation_calls[$index] eq 'z') { | |
4818 $counting{total_unmethylated_CpG_count}++; | |
4819 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4820 $mbias_1{CpG}->{$index+1}->{un}++; | |
4821 } | |
4822 elsif ($methylation_calls[$index] eq 'H') { | |
4823 $counting{total_meCHH_count}++; | |
4824 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4825 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4826 } | |
4827 elsif ($methylation_calls[$index] eq 'h') { | |
4828 $counting{total_unmethylated_CHH_count}++; | |
4829 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4830 $mbias_1{CHH}->{$index+1}->{un}++; | |
4831 } | |
4832 elsif ($methylation_calls[$index] eq '.') {} | |
4833 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4834 else{ | |
4835 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4836 } | |
4837 } | |
4838 } | |
4839 elsif ($strand eq '-') { | |
4840 | |
4841 for my $index (0..$#methylation_calls) { | |
4842 ### methylated Cs (any context) will receive a forward (+) orientation | |
4843 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
4844 | |
4845 if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels | |
4846 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); | |
4847 $cigar_offset += $cigar_mod; | |
4848 $pos_offset += $pos_mod; | |
4849 } | |
4850 | |
4851 if ($methylation_calls[$index] eq 'X') { | |
4852 $counting{total_meCHG_count}++; | |
4853 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4854 $mbias_1{CHG}->{$index+1}->{meth}++; | |
4855 } | |
4856 elsif ($methylation_calls[$index] eq 'x') { | |
4857 $counting{total_unmethylated_CHG_count}++; | |
4858 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4859 $mbias_1{CHG}->{$index+1}->{un}++; | |
4860 } | |
4861 elsif ($methylation_calls[$index] eq 'Z') { | |
4862 $counting{total_meCpG_count}++; | |
4863 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4864 $mbias_1{CpG}->{$index+1}->{meth}++; | |
4865 } | |
4866 elsif ($methylation_calls[$index] eq 'z') { | |
4867 $counting{total_unmethylated_CpG_count}++; | |
4868 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4869 $mbias_1{CpG}->{$index+1}->{un}++; | |
4870 } | |
4871 elsif ($methylation_calls[$index] eq 'H') { | |
4872 $counting{total_meCHH_count}++; | |
4873 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4874 $mbias_1{CHH}->{$index+1}->{meth}++; | |
4875 } | |
4876 elsif ($methylation_calls[$index] eq 'h') { | |
4877 $counting{total_unmethylated_CHH_count}++; | |
4878 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); | |
4879 $mbias_1{CHH}->{$index+1}->{un}++; | |
4880 } | |
4881 elsif ($methylation_calls[$index] eq '.') {} | |
4882 elsif (lc$methylation_calls[$index] eq 'u'){} | |
4883 else{ | |
4884 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
4885 } | |
4886 } | |
4887 } | |
4888 else { | |
4889 die "The strand information was neither + nor -: $strand\n"; | |
4890 } | |
4891 } | |
4892 } | |
4893 | |
4894 sub open_output_filehandles{ | |
4895 | |
4896 my $filename = shift; | |
4897 | |
4898 my $output_filename = (split (/\//,$filename))[-1]; | |
4899 my $report_filename = $output_filename; | |
4900 | |
4901 ### OPENING OUTPUT-FILEHANDLES | |
4902 if ($report) { | |
4903 $report_filename =~ s/\.sam$//; | |
4904 $report_filename =~ s/\.bam$//; | |
4905 $report_filename =~ s/\.cram$//; | |
4906 $report_filename =~ s/\.txt$//; | |
4907 $report_filename =~ s/$/_splitting_report.txt/; | |
4908 $report_filename = $output_dir . $report_filename; | |
4909 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; | |
4910 } | |
4911 | |
4912 if ($report) { | |
4913 | |
4914 print REPORT "$output_filename\n\n"; | |
4915 print REPORT "Parameters used to extract methylation information:\n"; | |
4916 print REPORT "Bismark Extractor Version: $version\n"; | |
4917 | |
4918 if ($paired) { | |
4919 if ($vanilla) { | |
4920 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; | |
4921 } else { | |
4922 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default | |
4923 } | |
4924 } | |
4925 | |
4926 if ($single) { | |
4927 if ($vanilla) { | |
4928 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; | |
4929 } else { | |
4930 print REPORT "Bismark result file: single-end (SAM format)\n"; # default | |
4931 } | |
4932 } | |
4933 if ($single){ | |
4934 if ($ignore) { | |
4935 print REPORT "Ignoring first $ignore bp\n"; | |
4936 } | |
4937 if ($ignore_3prime) { | |
4938 print REPORT "Ignoring last $ignore_3prime bp\n"; | |
4939 } | |
4940 } | |
4941 else{ # paired-end | |
4942 if ($ignore) { | |
4943 print REPORT "Ignoring first $ignore bp of Read 1\n"; | |
4944 } | |
4945 if ($ignore_r2){ | |
4946 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; | |
4947 } | |
4948 | |
4949 if ($ignore_3prime) { | |
4950 print REPORT "Ignoring last $ignore_3prime bp of Read 1\n"; | |
4951 } | |
4952 if ($ignore_3prime_r2){ | |
4953 print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n"; | |
4954 } | |
4955 | |
4956 } | |
4957 | |
4958 if ($full) { | |
4959 print REPORT "Output specified: comprehensive\n"; | |
4960 } else { | |
4961 print REPORT "Output specified: strand-specific (default)\n"; | |
4962 } | |
4963 | |
4964 if ($no_overlap) { | |
4965 print REPORT "No overlapping methylation calls specified\n"; | |
4966 } | |
4967 if ($genomic_fasta) { | |
4968 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; | |
4969 } | |
4970 if ($merge_non_CpG) { | |
4971 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; | |
4972 } | |
4973 | |
4974 print REPORT "\n"; | |
4975 } | |
4976 | |
4977 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; | |
4978 | |
4979 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL | |
4980 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file | |
4981 if ($full and $merge_non_CpG) { | |
4982 my $cpg_output = my $other_c_output = $output_filename; | |
4983 ### C in CpG context | |
4984 $cpg_output =~ s/^/CpG_context_/; | |
4985 $cpg_output =~ s/sam$/txt/; | |
4986 $cpg_output =~ s/bam$/txt/; | |
4987 $cpg_output =~ s/cram$/txt/; | |
4988 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
4989 $cpg_output = $output_dir . $cpg_output; | |
4990 | |
4991 if ($gzip){ | |
4992 $cpg_output .= '.gz'; | |
4993 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
4994 } | |
4995 else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option... | |
4996 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
4997 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
4998 } | |
4999 | |
5000 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); | |
5001 push @sorting_files,$cpg_output; | |
5002 | |
5003 unless ($no_header) { | |
5004 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5005 } | |
5006 | |
5007 ### C in any other context than CpG | |
5008 $other_c_output =~ s/^/Non_CpG_context_/; | |
5009 $other_c_output =~ s/sam$/txt/; | |
5010 $other_c_output =~ s/bam$/txt/; | |
5011 $other_c_output =~ s/cram$/txt/; | |
5012 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); | |
5013 $other_c_output = $output_dir . $other_c_output; | |
5014 | |
5015 if ($gzip){ | |
5016 $other_c_output .= '.gz'; | |
5017 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); | |
5018 } | |
5019 else{ | |
5020 open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only); | |
5021 # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); | |
5022 } | |
5023 | |
5024 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); | |
5025 push @sorting_files,$other_c_output; | |
5026 | |
5027 | |
5028 unless ($no_header) { | |
5029 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5030 } | |
5031 } | |
5032 | |
5033 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found | |
5034 elsif ($merge_non_CpG) { | |
5035 | |
5036 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
5037 | |
5038 ### For cytosines in CpG context | |
5039 $cpg_ot =~ s/^/CpG_OT_/; | |
5040 $cpg_ot =~ s/sam$/txt/; | |
5041 $cpg_ot =~ s/bam$/txt/; | |
5042 $cpg_ot =~ s/cram$/txt/; | |
5043 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
5044 $cpg_ot = $output_dir . $cpg_ot; | |
5045 | |
5046 if ($gzip){ | |
5047 $cpg_ot .= '.gz'; | |
5048 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5049 } | |
5050 else{ | |
5051 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5052 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5053 } | |
5054 | |
5055 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); | |
5056 push @sorting_files,$cpg_ot; | |
5057 | |
5058 unless($no_header){ | |
5059 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5060 } | |
5061 | |
5062 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
5063 $cpg_ctot =~ s/sam$/txt/; | |
5064 $cpg_ctot =~ s/bam$/txt/; | |
5065 $cpg_ctot =~ s/cram$/txt/; | |
5066 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
5067 $cpg_ctot = $output_dir . $cpg_ctot; | |
5068 | |
5069 if ($gzip){ | |
5070 $cpg_ctot .= '.gz'; | |
5071 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5072 } | |
5073 else{ | |
5074 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5075 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5076 } | |
5077 | |
5078 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); | |
5079 push @sorting_files,$cpg_ctot; | |
5080 | |
5081 unless($no_header){ | |
5082 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5083 } | |
5084 | |
5085 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
5086 $cpg_ctob =~ s/sam$/txt/; | |
5087 $cpg_ctob =~ s/bam$/txt/; | |
5088 $cpg_ctob =~ s/cram$/txt/; | |
5089 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
5090 $cpg_ctob = $output_dir . $cpg_ctob; | |
5091 | |
5092 if ($gzip){ | |
5093 $cpg_ctob .= '.gz'; | |
5094 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5095 } | |
5096 else{ | |
5097 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5098 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5099 } | |
5100 | |
5101 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); | |
5102 push @sorting_files,$cpg_ctob; | |
5103 | |
5104 unless($no_header){ | |
5105 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5106 } | |
5107 | |
5108 $cpg_ob =~ s/^/CpG_OB_/; | |
5109 $cpg_ob =~ s/sam$/txt/; | |
5110 $cpg_ob =~ s/bam$/txt/; | |
5111 $cpg_ob =~ s/cram$/txt/; | |
5112 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
5113 $cpg_ob = $output_dir . $cpg_ob; | |
5114 | |
5115 if ($gzip){ | |
5116 $cpg_ob .= '.gz'; | |
5117 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5118 } | |
5119 else{ | |
5120 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5121 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5122 } | |
5123 | |
5124 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); | |
5125 push @sorting_files,$cpg_ob; | |
5126 | |
5127 unless($no_header){ | |
5128 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5129 } | |
5130 | |
5131 ### For cytosines in Non-CpG (CC, CT or CA) context | |
5132 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; | |
5133 | |
5134 $other_c_ot =~ s/^/Non_CpG_OT_/; | |
5135 $other_c_ot =~ s/sam$/txt/; | |
5136 $other_c_ot =~ s/bam$/txt/; | |
5137 $other_c_ot =~ s/cram$/txt/; | |
5138 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); | |
5139 $other_c_ot = $output_dir . $other_c_ot; | |
5140 | |
5141 if ($gzip){ | |
5142 $other_c_ot .= '.gz'; | |
5143 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); | |
5144 } | |
5145 else{ | |
5146 open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); | |
5147 # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); | |
5148 } | |
5149 | |
5150 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); | |
5151 push @sorting_files,$other_c_ot; | |
5152 | |
5153 unless($no_header){ | |
5154 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5155 } | |
5156 | |
5157 $other_c_ctot =~ s/^/Non_CpG_CTOT_/; | |
5158 $other_c_ctot =~ s/sam$/txt/; | |
5159 $other_c_ctot =~ s/bam$/txt/; | |
5160 $other_c_ctot =~ s/cram$/txt/; | |
5161 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); | |
5162 $other_c_ctot = $output_dir . $other_c_ctot; | |
5163 | |
5164 if ($gzip){ | |
5165 $other_c_ctot .= '.gz'; | |
5166 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); | |
5167 } | |
5168 else{ | |
5169 open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); | |
5170 # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); | |
5171 } | |
5172 | |
5173 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); | |
5174 push @sorting_files,$other_c_ctot; | |
5175 | |
5176 unless($no_header){ | |
5177 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5178 } | |
5179 | |
5180 $other_c_ctob =~ s/^/Non_CpG_CTOB_/; | |
5181 $other_c_ctob =~ s/sam$/txt/; | |
5182 $other_c_ctob =~ s/bam$/txt/; | |
5183 $other_c_ctob =~ s/cram$/txt/; | |
5184 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); | |
5185 $other_c_ctob = $output_dir . $other_c_ctob; | |
5186 | |
5187 if ($gzip){ | |
5188 $other_c_ctob .= '.gz'; | |
5189 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); | |
5190 } | |
5191 else{ | |
5192 open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); | |
5193 # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); | |
5194 } | |
5195 | |
5196 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); | |
5197 push @sorting_files,$other_c_ctob; | |
5198 | |
5199 unless($no_header){ | |
5200 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5201 } | |
5202 | |
5203 $other_c_ob =~ s/^/Non_CpG_OB_/; | |
5204 $other_c_ob =~ s/sam$/txt/; | |
5205 $other_c_ob =~ s/sam$/txt/; | |
5206 $other_c_ob =~ s/cram$/txt/; | |
5207 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); | |
5208 $other_c_ob = $output_dir . $other_c_ob; | |
5209 | |
5210 if ($gzip){ | |
5211 $other_c_ob .= '.gz'; | |
5212 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); | |
5213 } | |
5214 else{ | |
5215 open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); | |
5216 # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); | |
5217 } | |
5218 | |
5219 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); | |
5220 push @sorting_files,$other_c_ob; | |
5221 | |
5222 unless($no_header){ | |
5223 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5224 } | |
5225 } | |
5226 | |
5227 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) | |
5228 | |
5229 ### if --comprehensive was specified we are only writing one file per context | |
5230 elsif ($full) { | |
5231 my $cpg_output = my $chg_output = my $chh_output = $output_filename; | |
5232 ### C in CpG context | |
5233 $cpg_output =~ s/^/CpG_context_/; | |
5234 $cpg_output =~ s/sam$/txt/; | |
5235 $cpg_output =~ s/bam$/txt/; | |
5236 $cpg_output =~ s/cram$/txt/; | |
5237 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); | |
5238 $cpg_output = $output_dir . $cpg_output; | |
5239 | |
5240 if ($gzip){ | |
5241 $cpg_output .= '.gz'; | |
5242 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
5243 } | |
5244 else{ | |
5245 open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
5246 # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); | |
5247 } | |
5248 | |
5249 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); | |
5250 push @sorting_files,$cpg_output; | |
5251 | |
5252 unless($no_header){ | |
5253 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5254 } | |
5255 | |
5256 ### C in CHG context | |
5257 $chg_output =~ s/^/CHG_context_/; | |
5258 $chg_output =~ s/sam$/txt/; | |
5259 $chg_output =~ s/bam$/txt/; | |
5260 $chg_output =~ s/cram$/txt/; | |
5261 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); | |
5262 $chg_output = $output_dir . $chg_output; | |
5263 | |
5264 if ($gzip){ | |
5265 $chg_output .= '.gz'; | |
5266 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); | |
5267 } | |
5268 else{ | |
5269 open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); | |
5270 # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); | |
5271 } | |
5272 | |
5273 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); | |
5274 push @sorting_files,$chg_output; | |
5275 | |
5276 unless($no_header){ | |
5277 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5278 } | |
5279 | |
5280 ### C in CHH context | |
5281 $chh_output =~ s/^/CHH_context_/; | |
5282 $chh_output =~ s/sam$/txt/; | |
5283 $chh_output =~ s/bam$/txt/; | |
5284 $chh_output =~ s/cram$/txt/; | |
5285 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); | |
5286 $chh_output = $output_dir . $chh_output; | |
5287 | |
5288 if ($gzip){ | |
5289 $chh_output .= '.gz'; | |
5290 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); | |
5291 } | |
5292 else{ | |
5293 open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); | |
5294 # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); | |
5295 } | |
5296 | |
5297 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); | |
5298 push @sorting_files, $chh_output; | |
5299 | |
5300 unless($no_header){ | |
5301 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5302 } | |
5303 } | |
5304 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found | |
5305 else { | |
5306 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; | |
5307 | |
5308 ### For cytosines in CpG context | |
5309 $cpg_ot =~ s/^/CpG_OT_/; | |
5310 $cpg_ot =~ s/sam$/txt/; | |
5311 $cpg_ot =~ s/bam$/txt/; | |
5312 $cpg_ot =~ s/cram$/txt/; | |
5313 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); | |
5314 $cpg_ot = $output_dir . $cpg_ot; | |
5315 | |
5316 if ($gzip){ | |
5317 $cpg_ot .= '.gz'; | |
5318 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5319 } | |
5320 else{ | |
5321 open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5322 # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); | |
5323 } | |
5324 | |
5325 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); | |
5326 push @sorting_files,$cpg_ot; | |
5327 | |
5328 unless($no_header){ | |
5329 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5330 } | |
5331 | |
5332 $cpg_ctot =~ s/^/CpG_CTOT_/; | |
5333 $cpg_ctot =~ s/sam$/txt/; | |
5334 $cpg_ctot =~ s/bam$/txt/; | |
5335 $cpg_ctot =~ s/cram$/txt/; | |
5336 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); | |
5337 $cpg_ctot = $output_dir . $cpg_ctot; | |
5338 | |
5339 if ($gzip){ | |
5340 $cpg_ctot .= '.gz'; | |
5341 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5342 } | |
5343 else{ | |
5344 open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5345 # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); | |
5346 } | |
5347 | |
5348 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); | |
5349 push @sorting_files,$cpg_ctot; | |
5350 | |
5351 unless($no_header){ | |
5352 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5353 } | |
5354 | |
5355 $cpg_ctob =~ s/^/CpG_CTOB_/; | |
5356 $cpg_ctob =~ s/sam$/txt/; | |
5357 $cpg_ctob =~ s/bam$/txt/; | |
5358 $cpg_ctob =~ s/cram$/txt/; | |
5359 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); | |
5360 $cpg_ctob = $output_dir . $cpg_ctob; | |
5361 | |
5362 if ($gzip){ | |
5363 $cpg_ctob .= '.gz'; | |
5364 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5365 } | |
5366 else{ | |
5367 open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5368 # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); | |
5369 } | |
5370 | |
5371 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); | |
5372 push @sorting_files,$cpg_ctob; | |
5373 | |
5374 unless($no_header){ | |
5375 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5376 } | |
5377 | |
5378 $cpg_ob =~ s/^/CpG_OB_/; | |
5379 $cpg_ob =~ s/sam$/txt/; | |
5380 $cpg_ob =~ s/bam$/txt/; | |
5381 $cpg_ob =~ s/cram$/txt/; | |
5382 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); | |
5383 $cpg_ob = $output_dir . $cpg_ob; | |
5384 | |
5385 if ($gzip){ | |
5386 $cpg_ob .= '.gz'; | |
5387 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5388 } | |
5389 else{ | |
5390 open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5391 # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); | |
5392 } | |
5393 | |
5394 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); | |
5395 push @sorting_files,$cpg_ob; | |
5396 | |
5397 unless($no_header){ | |
5398 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5399 } | |
5400 | |
5401 ### For cytosines in CHG context | |
5402 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; | |
5403 | |
5404 $chg_ot =~ s/^/CHG_OT_/; | |
5405 $chg_ot =~ s/sam$/txt/; | |
5406 $chg_ot =~ s/bam$/txt/; | |
5407 $chg_ot =~ s/cram$/txt/; | |
5408 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); | |
5409 $chg_ot = $output_dir . $chg_ot; | |
5410 | |
5411 if ($gzip){ | |
5412 $chg_ot .= '.gz'; | |
5413 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); | |
5414 } | |
5415 else{ | |
5416 open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); | |
5417 # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); | |
5418 } | |
5419 | |
5420 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); | |
5421 push @sorting_files,$chg_ot; | |
5422 | |
5423 unless($no_header){ | |
5424 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5425 } | |
5426 | |
5427 $chg_ctot =~ s/^/CHG_CTOT_/; | |
5428 $chg_ctot =~ s/sam$/txt/; | |
5429 $chg_ctot =~ s/bam$/txt/; | |
5430 $chg_ctot =~ s/cram$/txt/; | |
5431 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); | |
5432 $chg_ctot = $output_dir . $chg_ctot; | |
5433 | |
5434 if ($gzip){ | |
5435 $chg_ctot .= '.gz'; | |
5436 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); | |
5437 } | |
5438 else{ | |
5439 open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); | |
5440 # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); | |
5441 } | |
5442 | |
5443 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); | |
5444 push @sorting_files,$chg_ctot; | |
5445 | |
5446 unless($no_header){ | |
5447 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5448 } | |
5449 | |
5450 $chg_ctob =~ s/^/CHG_CTOB_/; | |
5451 $chg_ctob =~ s/sam$/txt/; | |
5452 $chg_ctob =~ s/bam$/txt/; | |
5453 $chg_ctob =~ s/cram$/txt/; | |
5454 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); | |
5455 $chg_ctob = $output_dir . $chg_ctob; | |
5456 | |
5457 if ($gzip){ | |
5458 $chg_ctob .= '.gz'; | |
5459 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); | |
5460 } | |
5461 else{ | |
5462 open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); | |
5463 # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); | |
5464 } | |
5465 | |
5466 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); | |
5467 push @sorting_files,$chg_ctob; | |
5468 | |
5469 unless($no_header){ | |
5470 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5471 } | |
5472 | |
5473 $chg_ob =~ s/^/CHG_OB_/; | |
5474 $chg_ob =~ s/sam$/txt/; | |
5475 $chg_ob =~ s/bam$/txt/; | |
5476 $chg_ob =~ s/cram$/txt/; | |
5477 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); | |
5478 $chg_ob = $output_dir . $chg_ob; | |
5479 | |
5480 if ($gzip){ | |
5481 $chg_ob .= '.gz'; | |
5482 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); | |
5483 } | |
5484 else{ | |
5485 open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); | |
5486 # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); | |
5487 } | |
5488 | |
5489 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); | |
5490 push @sorting_files,$chg_ob; | |
5491 | |
5492 unless($no_header){ | |
5493 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5494 } | |
5495 | |
5496 ### For cytosines in CHH context | |
5497 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; | |
5498 | |
5499 $chh_ot =~ s/^/CHH_OT_/; | |
5500 $chh_ot =~ s/sam$/txt/; | |
5501 $chh_ot =~ s/bam$/txt/; | |
5502 $chh_ot =~ s/cram$/txt/; | |
5503 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); | |
5504 $chh_ot = $output_dir . $chh_ot; | |
5505 | |
5506 if ($gzip){ | |
5507 $chh_ot .= '.gz'; | |
5508 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); | |
5509 } | |
5510 else{ | |
5511 open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); | |
5512 # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); | |
5513 } | |
5514 | |
5515 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); | |
5516 push @sorting_files,$chh_ot; | |
5517 | |
5518 unless($no_header){ | |
5519 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5520 } | |
5521 | |
5522 $chh_ctot =~ s/^/CHH_CTOT_/; | |
5523 $chh_ctot =~ s/sam$/txt/; | |
5524 $chh_ctot =~ s/bam$/txt/; | |
5525 $chh_ctot =~ s/cram$/txt/; | |
5526 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); | |
5527 $chh_ctot = $output_dir . $chh_ctot; | |
5528 | |
5529 if ($gzip){ | |
5530 $chh_ctot .= '.gz'; | |
5531 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); | |
5532 } | |
5533 else{ | |
5534 open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); | |
5535 # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); | |
5536 } | |
5537 | |
5538 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); | |
5539 push @sorting_files,$chh_ctot; | |
5540 | |
5541 unless($no_header){ | |
5542 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5543 } | |
5544 | |
5545 $chh_ctob =~ s/^/CHH_CTOB_/; | |
5546 $chh_ctob =~ s/sam$/txt/; | |
5547 $chh_ctob =~ s/bam$/txt/; | |
5548 $chh_ctob =~ s/cram$/txt/; | |
5549 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); | |
5550 $chh_ctob = $output_dir . $chh_ctob; | |
5551 | |
5552 if ($gzip){ | |
5553 $chh_ctob .= '.gz'; | |
5554 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); | |
5555 } | |
5556 else{ | |
5557 open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); | |
5558 # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); | |
5559 } | |
5560 | |
5561 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); | |
5562 push @sorting_files,$chh_ctob; | |
5563 | |
5564 unless($no_header){ | |
5565 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5566 } | |
5567 | |
5568 $chh_ob =~ s/^/CHH_OB_/; | |
5569 $chh_ob =~ s/sam$/txt/; | |
5570 $chh_ob =~ s/bam$/txt/; | |
5571 $chh_ob =~ s/cram$/txt/; | |
5572 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); | |
5573 $chh_ob = $output_dir . $chh_ob; | |
5574 | |
5575 if ($gzip){ | |
5576 $chh_ob .= '.gz'; | |
5577 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); | |
5578 } | |
5579 else{ | |
5580 open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); | |
5581 # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); | |
5582 } | |
5583 | |
5584 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); | |
5585 push @sorting_files,$chh_ob; | |
5586 | |
5587 unless($no_header){ | |
5588 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); | |
5589 } | |
5590 } | |
5591 return $report_filename; | |
5592 } | |
5593 | |
5594 sub isBam{ | |
5595 | |
5596 my $filename = shift; | |
5597 | |
5598 # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) | |
5599 open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; | |
5600 | |
5601 ### when BAM files read through a gunzip -c stream they start with BAM... | |
5602 my $bam_in_disguise = <DISGUISE>; | |
5603 # warn "BAM in disguise: $bam_in_disguise\n\n"; | |
5604 | |
5605 if ($bam_in_disguise){ | |
5606 if ($bam_in_disguise =~ /^BAM/){ | |
5607 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
5608 return 1; | |
5609 } | |
5610 else{ | |
5611 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
5612 return 0; | |
5613 } | |
5614 } | |
5615 else{ | |
5616 close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; | |
5617 return 0; | |
5618 } | |
5619 } | |
5620 | |
5621 | |
5622 sub print_helpfile{ | |
5623 | |
5624 print << 'HOW_TO'; | |
5625 | |
5626 | |
5627 DESCRIPTION | |
5628 | |
5629 The following is a brief description of all options to control the Bismark | |
5630 methylation extractor. The script reads in a bisulfite read alignment results file | |
5631 produced by the Bismark bisulfite mapper (in BAM/CRAM/SAM format) and extracts the | |
5632 methylation information for individual cytosines. This information is found in the | |
5633 methylation call field which can contain the following characters: | |
5634 | |
5635 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
5636 ~~~ X for methylated C in CHG context ~~~ | |
5637 ~~~ x for not methylated C CHG ~~~ | |
5638 ~~~ H for methylated C in CHH context ~~~ | |
5639 ~~~ h for not methylated C in CHH context ~~~ | |
5640 ~~~ Z for methylated C in CpG context ~~~ | |
5641 ~~~ z for not methylated C in CpG context ~~~ | |
5642 ~~~ U for methylated C in Unknown context (CN or CHN ~~~ | |
5643 ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ | |
5644 ~~~ . for any bases not involving cytosines ~~~ | |
5645 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
5646 | |
5647 The methylation extractor outputs result files for cytosines in CpG, CHG and CHH | |
5648 context (this distinction is actually already made in Bismark itself). As the methylation | |
5649 information for every C analysed can produce files which easily have tens or even hundreds of | |
5650 millions of lines, file sizes can become very large and more difficult to handle. The C | |
5651 methylation info additionally splits cytosine methylation calls up into one of the four possible | |
5652 strands a given bisulfite read aligned against: | |
5653 | |
5654 OT original top strand | |
5655 CTOT complementary to original top strand | |
5656 | |
5657 OB original bottom strand | |
5658 CTOB complementary to original bottom strand | |
5659 | |
5660 Thus, by default twelve individual output files are being generated per input file (unless | |
5661 --comprehensive is specified, see below). The output files can be imported into a genome | |
5662 viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact | |
5663 unless the bisulfite reads were generated preserving directionality it doesn't make any | |
5664 sense to look at the data in a strand-specific manner). Strand-specific output files can | |
5665 optionally be skipped, in which case only three output files for CpG, CHG or CHH context | |
5666 will be generated. For both the strand-specific and comprehensive outputs there is also | |
5667 the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. | |
5668 | |
5669 | |
5670 The output files are in the following format (tab delimited): | |
5671 | |
5672 <sequence_id> <strand> <chromosome> <position> <methylation call> | |
5673 | |
5674 | |
5675 USAGE: bismark_methylation_extractor [options] <filenames> | |
5676 | |
5677 | |
5678 ARGUMENTS: | |
5679 ========== | |
5680 | |
5681 <filenames> A space-separated list of Bismark result files in SAM format from | |
5682 which methylation information is extracted for every cytosine in | |
5683 the reads. For alignment files in the older custom Bismark output | |
5684 see option '--vanilla'. | |
5685 | |
5686 OPTIONS: | |
5687 | |
5688 -s/--single-end Input file(s) are Bismark result file(s) generated from single-end | |
5689 read data. If neither -s nor -p is set the type of experiment will | |
5690 be determined automatically. | |
5691 | |
5692 -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end | |
5693 read data. If neither -s nor -p is set the type of experiment will | |
5694 be determined automatically. | |
5695 | |
5696 --vanilla The Bismark result input file(s) are in the old custom Bismark format | |
5697 (up to version 0.5.x) and not in SAM format which is the default as | |
5698 of Bismark version 0.6.x or higher. Default: OFF. | |
5699 | |
5700 --no_overlap For paired-end reads it is theoretically possible that read_1 and | |
5701 read_2 overlap. This option avoids scoring overlapping methylation | |
5702 calls twice (only methylation calls of read 1 are used for in the process | |
5703 since read 1 has historically higher quality basecalls than read 2). | |
5704 Whilst this option removes a bias towards more methylation calls | |
5705 in the center of sequenced fragments it may de facto remove a sizable | |
5706 proportion of the data. This option is on by default for paired-end data | |
5707 but can be disabled using --include_overlap. Default: ON. | |
5708 | |
5709 --include_overlap For paired-end data all methylation calls will be extracted irrespective of | |
5710 of whether they overlap or not. Default: OFF. | |
5711 | |
5712 --ignore <int> Ignore the first <int> bp from the 5' end of Read 1 (or single-end alignment | |
5713 files) when processing the methylation call string. This can remove e.g. a | |
5714 restriction enzyme site at the start of each read or any other source of | |
5715 bias (such as PBAT-Seq data). | |
5716 | |
5717 --ignore_r2 <int> Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing | |
5718 results only. Since the first couple of bases in Read 2 of BS-Seq experiments | |
5719 show a severe bias towards non-methylation as a result of end-repairing | |
5720 sonicated fragments with unmethylated cytosines (see M-bias plot), it is | |
5721 recommended that the first couple of bp of Read 2 are removed before | |
5722 starting downstream analysis. Please see the section on M-bias plots in the | |
5723 Bismark User Guide for more details. | |
5724 | |
5725 --ignore_3prime <int> Ignore the last <int> bp from the 3' end of Read 1 (or single-end alignment | |
5726 files) when processing the methylation call string. This can remove unwanted | |
5727 biases from the end of reads. | |
5728 | |
5729 --ignore_3prime_r2 <int> Ignore the last <int> bp from the 3' end of Read 2 of paired-end sequencing | |
5730 results only. This can remove unwanted biases from the end of reads. | |
5731 | |
5732 --comprehensive Specifying this option will merge all four possible strand-specific | |
5733 methylation info into context-dependent output files. The default | |
5734 | |
5735 contexts are: | |
5736 - CpG context | |
5737 - CHG context | |
5738 - CHH context | |
5739 | |
5740 --merge_non_CpG This will produce two output files (in --comprehensive mode) or eight | |
5741 strand-specific output files (default) for Cs in | |
5742 - CpG context | |
5743 - non-CpG context | |
5744 | |
5745 --report Prints out a short methylation summary as well as the paramaters used to run | |
5746 this script. Default: ON. | |
5747 | |
5748 --no_header Suppresses the Bismark version header line in all output files for more convenient | |
5749 batch processing. | |
5750 | |
5751 -o/--output DIR Allows specification of a different output directory (absolute or relative | |
5752 path). If not specified explicitly, the output will be written to the current directory. | |
5753 | |
5754 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified | |
5755 explicitly if Samtools is in the PATH already. | |
5756 | |
5757 --gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in | |
5758 a GZIP compressed form to save disk space. This option is also passed on to the genome-wide | |
5759 cytosine report. BedGraph and coverage files will be written out as .gz by default. | |
5760 | |
5761 --version Displays version information. | |
5762 | |
5763 -h/--help Displays this help file and exits. | |
5764 | |
5765 --mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as | |
5766 well as a report (optional) and then quit. Default: OFF. | |
5767 | |
5768 --mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report. | |
5769 Only recommended for users who deliberately want to keep an earlier version of the M-bias report. | |
5770 Default: OFF. | |
5771 | |
5772 --multicore <int> Sets the number of cores to be used for the methylation extraction process. If system resources | |
5773 are plentiful this is a viable option to speed up the extraction process (we observed a near linear | |
5774 speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file | |
5775 and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore <int> | |
5776 specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so | |
5777 --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing | |
5778 on the bismark2bedGraph or genome-wide cytosine report processes. | |
5779 | |
5780 | |
5781 | |
5782 | |
5783 bedGraph specific options: | |
5784 ========================== | |
5785 | |
5786 --bedGraph After finishing the methylation extraction, the methylation output is written into a | |
5787 sorted bedGraph file that reports the position of a given cytosine and its methylation | |
5788 state (in %, see details below). The methylation extractor output is temporarily split up into | |
5789 temporary files, one per chromosome (written into the current directory or folder | |
5790 specified with -o/--output); these temp files are then used for sorting and deleted | |
5791 afterwards. By default, only cytosines in CpG context will be sorted. The option | |
5792 '--CX_context' may be used to report all cytosines irrespective of sequence context | |
5793 (this will take MUCH longer!). The default folder for temporary files during the sorting | |
5794 process is the output directory. The bedGraph conversion step is performed by the external | |
5795 module 'bismark2bedGraph'; this script needs to reside in the same folder as the | |
5796 bismark_methylation_extractor itself. | |
5797 | |
5798 --zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start | |
5799 and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, | |
5800 instead of using 1-based coordinates throughout. Default: OFF. | |
5801 | |
5802 | |
5803 --cutoff [threshold] The minimum number of times any methylation state (methylated or unmethylated) has to be seen | |
5804 for a nucleotide before its methylation percentage is reported. Default: 1. | |
5805 | |
5806 --remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. | |
5807 | |
5808 --CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered | |
5809 in the experiment irrespective of its sequence context. This applies to both forward and | |
5810 reverse strands. Please be aware that this option may generate large temporary and output files | |
5811 and may take a long time to sort (up to many hours). Default: OFF. | |
5812 (i.e. Default = CpG context only). | |
5813 | |
5814 --buffer_size <string> This allows you to specify the main memory sort buffer when sorting the methylation information. | |
5815 Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or | |
5816 a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. | |
5817 (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. | |
5818 Defaults to 2G. | |
5819 | |
5820 --scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of | |
5821 scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to | |
5822 individual chromosome files. These errors were caused by the operating system's limit | |
5823 of the number of filehandle that can be written to at any one time (typically 1024; to | |
5824 find out this limit on Linux, type: ulimit -a). | |
5825 To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort | |
5826 methylation calls into individual chromosome files. Instead, all input files are | |
5827 temporarily merged into a single file (unless there is only a single file), and this | |
5828 file will then be sorted by both chromosome AND position using the Unix sort command. | |
5829 Please be aware that this option might take a looooong time to complete, depending on | |
5830 the size of the input files, and the memory you allocate to this process (see --buffer_size). | |
5831 Nevertheless, it seems to be working. | |
5832 | |
5833 --ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will | |
5834 instead use two arrays to sort methylated and unmethylated calls. This may result in a faster | |
5835 sorting process of very large files, but this comes at the cost of a larger memory footprint | |
5836 (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB | |
5837 of RAM). Due to overheads in creating and looping through these arrays it seems that it will | |
5838 actually be *slower* for small files (few million alignments), and we are currently testing at | |
5839 which point it is advisable to use this option. Note that --ample_memory is not compatible | |
5840 with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). | |
5841 | |
5842 | |
5843 | |
5844 Genome-wide cytosine methylation report specific options: | |
5845 ========================================================= | |
5846 | |
5847 --cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a | |
5848 genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based | |
5849 chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all | |
5850 cytosine context is optional). The output considers all Cs on both forward and reverse strands and | |
5851 reports their position, strand, trinucleotide content and methylation state (counts are 0 if not | |
5852 covered). The cytosine report conversion step is performed by the external module | |
5853 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor | |
5854 itself. | |
5855 | |
5856 --CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of | |
5857 its context. This applies to both forward and reverse strands. Please be aware that this will | |
5858 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. | |
5859 Default: OFF (i.e. Default = CpG context only). | |
5860 | |
5861 --zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF. | |
5862 | |
5863 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted | |
5864 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. | |
5865 | |
5866 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files | |
5867 will be named to include the input filename and the chromosome number. | |
5868 | |
5869 | |
5870 | |
5871 OUTPUT: | |
5872 | |
5873 The bismark_methylation_extractor output is in the form: | |
5874 ======================================================== | |
5875 <seq-ID> <methylation state*> <chromosome> <start position (= end position)> <methylation call> | |
5876 | |
5877 * Methylated cytosines receive a '+' orientation, | |
5878 * Unmethylated cytosines receive a '-' orientation. | |
5879 | |
5880 | |
5881 | |
5882 The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): | |
5883 ========================================================================================================= | |
5884 | |
5885 track type=bedGraph (header line) | |
5886 | |
5887 <chromosome> <start position> <end position> <methylation percentage> | |
5888 | |
5889 | |
5890 | |
5891 The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'): | |
5892 ============================================================================================================================================ | |
5893 | |
5894 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count non-methylated> | |
5895 | |
5896 | |
5897 | |
5898 The genome-wide cytosine methylation output file is tab-delimited in the following format: | |
5899 ========================================================================================== | |
5900 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> | |
5901 | |
5902 | |
5903 | |
5904 This script was last modified on 25 February 2016. | |
5905 | |
5906 HOW_TO | |
5907 } |