0
|
1 #!/usr/bin/perl
|
|
2 use warnings;
|
|
3 use strict;
|
|
4 $|++;
|
|
5 use Getopt::Long;
|
|
6 use Cwd;
|
|
7 use Carp;
|
|
8 use FindBin qw($Bin);
|
|
9 use lib "$Bin/../lib";
|
|
10
|
3
|
11
|
0
|
12 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
|
|
13
|
|
14 ## This program is free software: you can redistribute it and/or modify
|
|
15 ## it under the terms of the GNU General Public License as published by
|
|
16 ## the Free Software Foundation, either version 3 of the License, or
|
|
17 ## (at your option) any later version.
|
|
18
|
|
19 ## This program is distributed in the hope that it will be useful,
|
|
20 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
21 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
22 ## GNU General Public License for more details.
|
|
23
|
|
24 ## You should have received a copy of the GNU General Public License
|
|
25 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
26
|
|
### File-level state shared by the subroutines of this script

my @filenames;               # input files, assigned from @ARGV inside process_commandline()
my %counting;                # methylation call counters, reset for every input file
my %fhs;                     # filehandles of the input file currently being processed, reset per file
my $parent_dir = getcwd();   # working directory at start-up

my $version = 'v0.10.1';

### All run settings come back from process_commandline() as one positional list;
### the order here has to match the order of its return statement
my (
  $ignore,
  $genomic_fasta,
  $single,
  $paired,
  $full,
  $report,
  $no_overlap,
  $merge_non_CpG,
  $vanilla,
  $output_dir,
  $no_header,
  $bedGraph,
  $remove,
  $coverage_threshold,
  $counts,
  $cytosine_report,
  $genome_folder,
  $zero,
  $CpG_only,
  $CX_context,
  $split_by_chromosome,
  $sort_size,
  $samtools_path,
  $gzip,
  $ignore_r2,
  $mbias_only,
  $gazillion,
  $ample_mem,
) = process_commandline();

### bedGraph output only
my @sorting_files;                   # methylation extractor output files that are handed on to the bedGraph step
my @methylcalls = ('0', '0', '0');   # [0] = methylated, [1] = unmethylated, [2] = total
my @bedfiles;

### genome-wide cytosine methylation report only
my %chromosomes;

### M-bias data for Read 1 and Read 2, reset for every input file
my %mbias_1;
my %mbias_2;
|
|
47
|
0
|
##############################################################################################
### Summarising Run Parameters
##############################################################################################

### METHYLATION EXTRACTOR

warn "Summarising Bismark methylation extractor parameters:\n";
warn '=' x 63, "\n";

# Input format. Nothing is printed here if neither $single nor $paired is set.
if ($single){
  warn( $vanilla
        ? "Bismark single-end vanilla format specified\n"
        : "Bismark single-end SAM format specified (default)\n" );
}
elsif ($paired){
  warn( $vanilla
        ? "Bismark paired-end vanilla format specified\n"
        : "Bismark paired-end SAM format specified (default)\n" );
}

# Bases ignored at the start of the read(s); everything that is not single-end is reported as paired-end
if ($single){
  warn "First $ignore bp will be disregarded when processing the methylation call string\n" if ($ignore);
}
else{
  warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n" if ($ignore);
  warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n" if ($ignore_r2);
}

warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n" if ($full);
warn "Merge CHG and CHH context to non-CpG context specified\n" if ($merge_non_CpG);

### output directory
if ($output_dir ne ''){
  warn "Output path specified as: $output_dir\n";
}
else{
  warn "Output will be written to the current directory ('$parent_dir')\n";
}

sleep (1);

### BEDGRAPH

if ($bedGraph){
  warn "\n\nSummarising bedGraph parameters:\n";
  warn '=' x 63, "\n";

  if (!$counts){
    warn "Generating additional sorted output in bedGraph format (output format: <Chromosome> <Start Position> <End Position> <Methylation Percentage>)\n";
  }
  else{
    warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage>\ncoverage format:\t<Chromosome> <Start Position> <End Position> <Methylation Percentage> <count methylated> <count non-methylated>\n\n";
  }

  warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n";

  if (!$CX_context){ # default
    $CpG_only = 1;
    warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n";
  }
  else{
    warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n";
  }

  warn "White spaces in read ID names will be removed prior to sorting\n" if ($remove);

  # how the positions are going to be sorted in the bedGraph step
  if ($ample_mem){
    warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n";
  }
  elsif (defined $sort_size){
    warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n";
  }
  else{
    warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n";
  }

  sleep (1);

  ### GENOME-WIDE CYTOSINE REPORT (only summarised when the bedGraph step is active)

  if ($cytosine_report){
    warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n";
    warn '=' x 63, "\n";
    warn "Generating comprehensive genome-wide cytosine report\n(output format: <Chromosome> <Position> <Strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> )\n";

    if (!$CX_context){ # default
      $CpG_only = 1;
      warn "Reporting cytosine methylation in CpG context only (default)\n";
    }
    else{
      warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n";
    }

    warn "Splitting the cytosine report output up into individual files for each chromosome\n" if ($split_by_chromosome);

    ### Zero-based coordinates
    warn( $zero
          ? "Using zero-based genomic coordinates (user-defined)\n"
          : "Using 1-based genomic coordinates (default)\n" );

    ### GENOME folder, always stored with a trailing slash
    if (!$genome_folder){
      $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/';
      warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n";
    }
    else{
      $genome_folder =~ s{$}{/} unless ($genome_folder =~ m{/$});
      warn "Genome folder was specified as $genome_folder\n";
    }
    sleep (1);
  }
}

warn "\n";
sleep (5);
|
|
190
|
|
######################################################
### PROCESSING FILES
######################################################

### Every input file is extracted on its own; optionally the extractor output is then handed on to
### bismark2bedGraph and coverage2cytosine (both expected next to this script in $Bin).
foreach my $filename (@filenames){
  # resetting counters and filehandles
  %fhs = ();
  %counting =(
	      total_meCHG_count => 0,
	      total_meCHH_count => 0,
	      total_meCpG_count => 0,
	      total_unmethylated_CHG_count => 0,
	      total_unmethylated_CHH_count => 0,
	      total_unmethylated_CpG_count => 0,
	      sequences_count => 0,
	     );

  @sorting_files = ();
  @bedfiles = ();

  %mbias_1 = ();
  %mbias_2 = ();

  ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor
  if (!$vanilla and $paired){
    test_positional_sorting($filename);
  }

  process_Bismark_results_file($filename);

  ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues
  foreach my $fh (keys %fhs) {
    if ($fh =~ /^[1230]$/) {
      # these keys hold one filehandle per context
      foreach my $context (keys %{$fhs{$fh}}) {
	close $fhs{$fh}->{$context} or die $!;
      }
    }
    else{
      close $fhs{$fh} or die $!;
    }
  }

  ### printing out all M-Bias data
  produce_mbias_plots ($filename);

  delete_unused_files();

  next unless ($bedGraph);

  ### BEDGRAPH CONVERSION

  my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $out =~ s/gz$//;
  $out =~ s/sam$//;
  $out =~ s/bam$//;
  $out =~ s/txt$//;
  $out =~ s/$/bedGraph/;

  my $bedGraph_output = $out;

  # The arguments are passed as a list so that no shell is involved: file names and directories containing
  # white space or shell metacharacters reach the helper script unchanged
  my @args;
  push @args, '--remove'       if ($remove);
  push @args, '--CX_context'   if ($CX_context);
  push @args, '--no_header'    if ($no_header);
  push @args, '--gazillion'    if ($gazillion);
  push @args, '--ample_memory' if ($ample_mem);

  push @args, '--buffer_size', $sort_size;
  push @args, '--cutoff', $coverage_threshold;
  push @args, '--output', $bedGraph_output;
  push @args, '--dir', $output_dir;

  ### adding all files to be sorted to @args
  push @args, @sorting_files;

  unless (system ("$Bin/bismark2bedGraph", @args) == 0){
    # the cytosine report reads the coverage file written by bismark2bedGraph, so it is skipped as well
    warn "bismark2bedGraph failed for file $filename (wait status: $?, $!). Skipping the remaining steps for this file\n\n";
    next;
  }

  warn "Finished BedGraph conversion ...\n\n";
  sleep(3);

  ### genome-wide cytosine methylation report requires bedGraph processing anyway
  next unless ($cytosine_report);

  my $cytosine_out = $out;
  $cytosine_out =~ s/bedGraph$//;
  $cytosine_out .= $CX_context ? 'CX_report.txt' : 'CpG_report.txt';

  my @report_args = (
		     '--output',     $cytosine_out,
		     '--dir',        $output_dir,
		     '--genome',     $genome_folder,
		     '--parent_dir', $parent_dir,
		    );
  push @report_args, '--zero'                if ($zero);
  push @report_args, '--CX_context'          if ($CX_context);
  push @report_args, '--split_by_chromosome' if ($split_by_chromosome);

  my $coverage_output = $bedGraph_output;
  $coverage_output =~ s/bedGraph$/bismark.cov/;

  push @report_args, $output_dir . $coverage_output; # this will be the infile

  if (system ("$Bin/coverage2cytosine", @report_args) == 0){
    warn "\n\nFinished generating genome-wide cytosine report\n\n";
  }
  else{
    warn "\n\ncoverage2cytosine failed for file $filename (wait status: $?, $!)\n\n";
  }
}
|
|
335
|
3
|
### Goes through @sorting_files (the methylation extractor output files of the current input file) and
### deletes every file that contains nothing but 'Bismark...' header lines. Deleted files are also removed
### from @sorting_files so that later steps only see files with data.
### Takes no arguments and returns nothing useful; dies if one of the files cannot be opened.
sub delete_unused_files{

  warn "Deleting unused files ...\n\n"; sleep(1);

  my @kept;

  foreach my $file (@sorting_files){

    # 3-argument/list-form open: the file name is never interpreted by the shell or as an open mode
    my $in;
    if ($file =~ /gz$/){
      open ($in,'-|','zcat',$file) or die "Failed to read from methylation extractor output file $file: $!\n";
    }
    else{
      open ($in,'<',$file) or die "Failed to read from methylation extractor output file $file: $!\n";
    }

    # the first line that is not a header line is enough to know that the file is in use
    my $used = 0;
    while (my $line = <$in>){
      next if ($line =~ /^Bismark/);
      $used = 1;
      last;
    }

    # The return value is ignored on purpose: when reading stops early zcat is terminated by SIGPIPE
    # and close() reports a failure even though nothing went wrong
    close $in;

    if ($used){
      warn "$file contains data ->\tkept\n";
      push @kept, $file;
      next;
    }

    if (unlink $file){
      warn "$file was empty ->\tdeleted\n";
    }
    else{
      warn "$file was empty, however deletion was unsuccessful: $!\n"
    }
  }

  ### empty files are dropped from @sorting_files no matter whether the deletion itself worked
  @sorting_files = @kept;

  warn "\n\n";
}
|
|
381
|
|
### Writes the M-bias table (<input name>M-bias.txt) for the file that has just been processed and, if the
### modules GD::Graph::lines and GD::Graph::colour can be loaded, draws one M-bias plot per read as PNG.
### Argument: name of the input file (only used to derive the output file names)
### Reads the file-level variables %mbias_1, %mbias_2, $output_dir, $single and $paired
### Dies if an output file cannot be written or if GD::Graph reports an error
sub produce_mbias_plots{

  my $filename = shift;

  my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $mbias =~ s/gz$//;
  $mbias =~ s/sam$//;
  $mbias =~ s/bam$//;
  $mbias =~ s/txt$//;

  my $mbias_graph_1 = $output_dir . $mbias . 'M-bias_R1.png';
  my $mbias_graph_2 = $output_dir . $mbias . 'M-bias_R2.png';
  my $mbias_table   = $output_dir . $mbias . 'M-bias.txt';

  open (my $mbias_fh,'>',$mbias_table) or die "Failed to open file for the M-bias data: $!\n\n";

  # determining maximum read length
  my $max_length_1 = _mbias_max_position(\%mbias_1);
  my $max_length_2 = $paired ? _mbias_max_position(\%mbias_2) : 0;

  if ($single){
    warn "Determining maximum read length for M-Bias plot\n";
    warn "Maximum read length of Read 1: $max_length_1\n\n";
  }
  else{
    warn "Determining maximum read lengths for M-Bias plots\n";
    warn "Maximum read length of Read 1: $max_length_1\n";
    warn "Maximum read length of Read 2: $max_length_2\n\n";
  }

  # Plots need both GD::Graph::lines and GD::Graph::colour. Each eval block ends in 1 so that
  # success is read from its return value and not from $@
  my $gd_graph_installed = 0;

  if (eval { require GD::Graph::lines; GD::Graph::lines->import(); 1 }){
    if (eval { require GD::Graph::colour; GD::Graph::colour->import(qw(:colours :lists :files :convert)); 1 }){
      $gd_graph_installed = 1;
    }
    else{
      warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
      sleep(2);
    }
  }
  else{
    warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n";
    sleep(2);
  }

  ### READ 1 (or the only read for single-end files)
  my ($suffix_1,$underline_1,$title_1) = $paired
    ? (' (R1)','================','M-bias (Read 1)')
    : ('','===========','M-bias');

  my $mbias_read1 = _mbias_write_table($mbias_fh,\%mbias_1,$max_length_1,$suffix_1,$underline_1);

  if ($gd_graph_installed){
    _mbias_draw_plot(
		     data     => $mbias_read1,
		     file     => $mbias_graph_1,
		     number   => 1,
		     title    => $title_1,
		     x_max    => $max_length_1,
		     y2_label => '# methylation calls',
		    );
  }

  ### READ 2
  if ($paired){
    my $mbias_read2 = _mbias_write_table($mbias_fh,\%mbias_2,$max_length_2,' (R2)','================');

    if ($gd_graph_installed){
      _mbias_draw_plot(
		       data     => $mbias_read2,
		       file     => $mbias_graph_2,
		       number   => 2,
		       title    => 'M-bias (Read 2)',
		       x_max    => $max_length_2, # the x-axis of the Read 2 plot follows the length of Read 2
		       y2_label => '# calls',
		      );
    }
  }

  close $mbias_fh or die "Failed to close the M-bias data file $mbias_table: $!\n\n";
}

### Returns the highest position stored in a hash of the form {context}->{position}, or 0 if it is empty
sub _mbias_max_position{
  my ($mbias) = @_;

  my $max = 0;
  foreach my $context (keys %{$mbias}){
    foreach my $pos (keys %{$mbias->{$context}}){
      $max = $pos if ($pos > $max);
    }
  }
  return $max;
}

### Prints the CpG, CHG and CHH tables of one read to the M-bias filehandle and returns the data set for
### the plot as an array reference: [0] positions, [1-3] % methylation and [4-6] number of calls for
### CpG, CHG and CHH. Positions without calls are stored as 0 in the M-bias hash that was passed in.
sub _mbias_write_table{
  my ($fh,$mbias,$max_length,$suffix,$underline) = @_;

  my %series_of = (CpG => 1, CHG => 2, CHH => 3);
  my @data;

  foreach my $context (qw(CpG CHG CHH)){
    @{$data[0]} = (); # the positions are the same for every context, only one copy is kept

    print {$fh} "$context context$suffix\n$underline\n";
    print {$fh} "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n";

    foreach my $pos (1..$max_length){

      my $calls = $mbias->{$context}->{$pos} ||= {};
      $calls->{meth} = 0 unless (defined $calls->{meth});
      $calls->{un}   = 0 unless (defined $calls->{un});

      my $coverage = $calls->{un} + $calls->{meth};

      # the percentage stays empty for positions without any call
      my $percent = '';
      if ($coverage > 0){
	$percent = sprintf("%.2f",$calls->{meth} * 100 / $coverage);
      }

      print {$fh} "$pos\t$calls->{meth}\t$calls->{un}\t$percent\t$coverage\n";

      push @{$data[0]},$pos;
      push @{$data[ $series_of{$context} ]},$percent;
      push @{$data[ $series_of{$context} + 3 ]},$coverage;
    }
    print {$fh} "\n";
  }
  return \@data;
}

### Draws one M-bias plot (% methylation on the left axis, number of calls on the right axis) and writes
### it as PNG. Named arguments: data, file, number (1 or 2, used in messages), title, x_max, y2_label.
### Must only be called after GD::Graph::lines and GD::Graph::colour have been loaded.
sub _mbias_draw_plot{
  my %plot = @_;

  # NOTE(review): assumes GD::Graph refuses to plot a data set without points - TODO confirm.
  # Without any recorded position there is nothing to draw either way.
  if ($plot{x_max} < 1){
    warn "No M-bias data available for plot $plot{number}, skipping drawing $plot{file}\n";
    return;
  }

  add_colour(nice_blue   => [31,120,180]);
  add_colour(nice_orange => [255,127,0]);
  add_colour(nice_green  => [51,160,44]);
  add_colour(pale_blue   => [153,206,227]);
  add_colour(pale_orange => [253,204,138]);
  add_colour(pale_green  => [191,230,207]);

  my $graph = GD::Graph::lines->new(800,600);

  $graph->set(
	      x_label              => 'position (bp)',
	      y1_label             => '% methylation',
	      y2_label             => $plot{y2_label},
	      title                => $plot{title},
	      line_width           => 2,
	      x_max_value          => $plot{x_max},
	      x_min_value          => 0,
	      y_tick_number        => 10,
	      y_label_skip         => 2,
	      y1_max_value         => 100,
	      y1_min_value         => 0,
	      y2_min_value         => 0,
	      x_label_skip         => 5,
	      x_label_position     => 0.5,
	      x_tick_offset        => -1,
	      bgclr                => 'white',
	      transparent          => 0,
	      two_axes             => 1,
	      use_axis             => [1,1,1,2,2,2],
	      legend_placement     => 'RC',
	      legend_spacing       => 6,
	      legend_marker_width  => 24,
	      legend_marker_height => 18,
	      dclrs                => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)],
	     ) or die $graph->error;

  $graph->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls');

  my $gd = $graph->plot($plot{data}) or die $graph->error;

  open (my $png_fh,'>',$plot{file}) or die "Failed to write to file for M-bias plot $plot{number}: $!\n\n";
  binmode $png_fh;
  print {$png_fh} $gd->png;
  close $png_fh or die "Failed to close file for M-bias plot $plot{number}: $!\n\n";
}
|
0
|
654
|
|
### Parses and validates the command line.
### Side effects: fills the file-level array @filenames from @ARGV, prints the help text or the version
### and exits if asked to, and dies with a message on any invalid combination of options.
### Returns the settings as one positional list; the caller unpacks it in exactly this order:
### ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,
###  $output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,
###  $zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,
###  $mbias_only,$gazillion,$ample_mem)
sub process_commandline{
  my ($help,$extractor_version);
  my ($single_end,$paired_end,$vanilla);
  my ($ignore,$ignore_r2,$no_overlap);
  my ($genomic_fasta,$full,$report,$merge_non_CpG,$output_dir,$no_header,$gzip,$mbias_only);
  my ($bedGraph,$remove,$counts,$sort_size,$gazillion,$ample_mem);
  my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status
  my ($cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome);
  my $samtools_path;

  my $command_line = GetOptions ('help|man' => \$help,
				 'p|paired-end' => \$paired_end,
				 's|single-end' => \$single_end,
				 'fasta' => \$genomic_fasta,
				 'ignore=i' => \$ignore,
				 'ignore_r2=i' => \$ignore_r2,
				 'comprehensive' => \$full,
				 'report' => \$report,
				 'version' => \$extractor_version,
				 'no_overlap' => \$no_overlap,
				 'merge_non_CpG' => \$merge_non_CpG,
				 'vanilla' => \$vanilla,
				 'o|output=s' => \$output_dir,
				 'no_header' => \$no_header,
				 'bedGraph' => \$bedGraph,
				 "cutoff=i" => \$coverage_threshold,
				 "remove_spaces" => \$remove,
				 "counts" => \$counts,
				 "cytosine_report" => \$cytosine_report,
				 'g|genome_folder=s' => \$genome_folder,
				 "zero_based" => \$zero,
				 "CX|CX_context" => \$CX_context,
				 "split_by_chromosome" => \$split_by_chromosome,
				 "buffer_size=s" => \$sort_size,
				 'samtools_path=s' => \$samtools_path,
				 "gzip" => \$gzip,
				 "mbias_only" => \$mbias_only,
				 "gazillion|scaffolds" => \$gazillion,
				 "ample_memory" => \$ample_mem,
				);

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($command_line){
    die "Please respecify command line options\n";
  }

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }

  if ($extractor_version){
    print << "VERSION";


Bismark Methylation Extractor

Bismark Extractor Version: $version
Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/bismark/


VERSION
    exit;
  }

  ### no files provided
  unless (@ARGV){
    die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
  }
  @filenames = @ARGV;

  warn "\n *** Bismark methylation extractor version $version ***\n\n";

  ### M-BIAS ONLY
  if ($mbias_only){
    if ($bedGraph){
      die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n";
    }
    if ($cytosine_report){
      die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n";
    }
    if ($merge_non_CpG){
      warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n";
    }
    if ($full){
      warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n";
    }
    sleep(3);
  }

  ### Switches that were not set are handed back as 0 instead of undef
  $report    ||= 0; # PRINT A REPORT
  $no_header ||= 0; # NO HEADER
  $vanilla   ||= 0; # OLD (VANILLA) OUTPUT FORMAT

  ### OUTPUT DIR PATH, stored with a trailing slash (or as empty string for the current directory)
  if ($output_dir){
    unless ($output_dir =~ /\/$/){
      $output_dir =~ s/$/\//;
    }
  }
  else{
    $output_dir = '';
  }

  ### PATH TO SAMTOOLS
  ### This is resolved before the library type is determined, because reading the header of a BAM file
  ### below already needs Samtools and has to honour '--samtools_path'
  if (defined $samtools_path){
    # anything that is not the full command is treated as the folder containing Samtools
    unless ($samtools_path =~ /samtools$/){
      unless ($samtools_path =~ /\/$/){
	$samtools_path =~ s/$/\//;
      }
      $samtools_path .= 'samtools';
    }
    unless (-e $samtools_path){
      die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
    }
  }
  # Check whether Samtools is in the PATH if no path was supplied by the user
  else{
    if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
      $samtools_path = `which samtools`;
      chomp $samtools_path;
    }
  }

  unless (defined $samtools_path){
    $samtools_path = '';
  }

  ### SINGLE-END OR PAIRED-END
  if ($single_end){
    $paired_end = 0;   ### SINGLE END ALIGNMENTS
  }
  elsif ($paired_end){
    $single_end = 0;   ### PAIRED-END ALIGNMENTS
  }
  else{
    if ($vanilla){
      die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n";
    }
    # SAM/BAM format: the first input file decides for all files
    ($single_end,$paired_end) = _library_type_from_sam_header($filenames[0],$samtools_path);
  }

  ### IGNORING <INT> bases at the start of the read when processing the methylation call string
  $ignore ||= 0;

  if (defined $ignore_r2){
    die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end);
  }
  else{
    $ignore_r2 = 0;
  }

  ### NO OVERLAP
  if ($no_overlap){
    die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end);
  }
  else{
    $no_overlap = 0;
  }

  $full          ||= 0; # COMPREHENSIVE OUTPUT
  $merge_non_CpG ||= 0; # MERGE NON-CpG context
  $remove        ||= 0; # remove white spaces in read ID (needed for sorting using the sort command)

  ### COVERAGE THRESHOLD FOR bedGraph OUTPUT
  if (defined $coverage_threshold){
    unless ($coverage_threshold > 0){
      die "Please select a coverage greater than 0 (positive integers only)\n";
    }
  }
  else{
    $coverage_threshold = 1;
  }

  ### SORT buffer size
  if (defined $sort_size){
    unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
      die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
    }
  }
  else{
    $sort_size = '2G';
  }

  if ($zero){
    die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report);
  }

  if ($CX_context){
    die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph);
  }
  else{
    $CX_context = 0;
  }

  $counts ||= 1; # counts will always be set

  if ($cytosine_report){

    ### GENOME folder
    if ($genome_folder){
      unless ($genome_folder =~/\/$/){
	$genome_folder =~ s/$/\//;
      }
    }
    else{
      die "Please specify a genome folder to proceed (full path only)\n";
    }

    unless ($bedGraph){
      warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n";
      $bedGraph = 1;
    }
    warn "\n";
  }

  if ($gazillion and $ample_mem){
    die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
  }

  return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem);
}

### Reads the header of a SAM/BAM file and decides from its @PG line(s) whether the file holds single-end
### or paired-end alignments: a @PG line containing both '-1' and '-2' means paired-end. If there are
### several @PG lines the last one wins.
### Arguments: file name, path to the Samtools executable ('' if none was found)
### Returns ($single_end,$paired_end); dies if the file cannot be read or no @PG line was found
sub _library_type_from_sam_header{
  my ($file,$samtools_path) = @_;

  warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1);

  # list-form pipes: the file name is passed on to zcat/samtools without going through a shell
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Unable to read from gzipped file $file: $!\n";
  }
  elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){
    unless ($samtools_path){
      die "Sorry couldn't find an installation of Samtools. Either specify an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
    }
    open ($in,'-|',$samtools_path,'view','-h',$file) or die "Unable to read from BAM file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Unable to read from $file: $!\n";
  }

  my ($single_end,$paired_end);

  while (my $line = <$in>){
    last unless ($line =~ /^\@/);     # end of the header
    next unless ($line =~ /^\@PG/);

    if ($line =~ /-1/ and $line =~ /-2/){
      warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1);
      $paired_end = 1;
      $single_end = 0;
    }
    else{
      warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1);
      $paired_end = 0;
      $single_end = 1;
    }
  }

  # The return value is ignored on purpose: reading stops after the header, so a piped zcat/samtools
  # is terminated by SIGPIPE and close() reports a failure even though nothing went wrong
  close $in;

  unless (defined $single_end){
    die "Unable to determine from the header of file $file whether the data is single-end or paired-end (no \@PG line found). Please specify '-s' or '-p'\n\n";
  }

  return ($single_end,$paired_end);
}
|
|
987
|
|
988
|
|
### test_positional_sorting($filename)
###
### Sanity check for a paired-end Bismark SAM/BAM result file: the methylation extractor relies on
### Read 1 and Read 2 of a pair sitting on directly consecutive lines. This routine
###   - dies if the SAM header declares the file as sorted by coordinate, and
###   - dies if, within the first 100000 alignment lines, two consecutive lines do not share the same
###     read ID (optionally carrying the legacy /1 and /2 suffixes).
### Input may be plain SAM, gzipped SAM (.gz) or BAM (read via the file-level $samtools_path).
### Returns nothing useful; it prints/warns progress messages and pauses for a second before and after.
sub test_positional_sorting {

  my $filename = shift;

  print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
  sleep(1);

  ### Opening the input. External programs are started with the list form of open so that the file
  ### name is never interpreted by a shell (spaces or shell meta-characters in paths are safe)
  my $test_fh;

  if ($filename =~ /\.gz$/) {
    open ($test_fh,'-|','zcat',$filename) or die "Can't open gzipped file $filename: $!\n";
  }
  else {
    my $is_bam = ($filename =~ /bam$/) ? 1 : 0;

    ### files without a bam extension are handed to file(1); BAM files are reported as gzip compressed data
    unless ($is_bam) {
      if (open (my $file_fh,'-|','file','-b',$filename)) {
        my $file_type = <$file_fh>;
        close $file_fh; # exit status of file(1) is not of interest here
        $is_bam = 1 if (defined $file_type and $file_type =~ /^gzip/);
      }
    }

    if ($is_bam) {
      if ($samtools_path){
        open ($test_fh,'-|',$samtools_path,'view','-h',$filename) or die "Can't open BAM file $filename: $!\n";
      }
      else{
        die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
      }
    }
    else {
      open ($test_fh,'<',$filename) or die "Can't open file $filename: $!\n";
    }
  }

  my $count = 0;

  ### lines are read into lexical variables so that the caller's $_ is left untouched
  while (my $line_1 = <$test_fh>) {

    if ($line_1 =~ /^\@/) { # header lines
      ### The SAM specification stores the sort order as the SO: tag of the @HD line; there is no @SO
      ### record type. The historic /^\@SO/ test is kept alongside the @HD test
      if ($line_1 =~ /^\@SO/ or $line_1 =~ /^\@HD\t.*\bSO:coordinate\b/) {
        die "SAM/BAM header line '$line_1' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n";
      }
      next;
    }
    $count++;

    last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

    my ($id_1) = split (/\t/,$line_1);

    ### reading the next line which should be read 2. At the end of the file (odd number of lines)
    ### there is nothing left to compare against
    my $line_2 = <$test_fh>;
    last unless (defined $line_2);

    my ($id_2) = split (/\t/,$line_2);
    last unless (defined $id_2 and length $id_2); # a read ID of '0' is still a valid ID
    ++$count;

    ### ids are the same
    next if ($id_1 eq $id_2);

    ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
    (my $id_1_trunc = $id_1) =~ s/\/1$//;
    (my $id_2_trunc = $id_2) =~ s/\/2$//;

    unless ($id_1_trunc eq $id_2_trunc){
      die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n";
    }
  }

  ### Releasing the handle (and reaping zcat/samtools if we read from a pipe). The return value is
  ### deliberately not checked: when we stop reading early the child process is terminated by SIGPIPE
  ### and close() then reports failure even though nothing went wrong on our side
  close $test_fh;

  ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
  warn "...passed!\n";
  sleep(1);

}
|
|
1053
|
|
1054
|
|
1055 sub process_Bismark_results_file{
|
|
1056 my $filename = shift;
|
|
1057
|
|
1058 warn "\nNow reading in Bismark result file $filename\n\n";
|
|
1059
|
|
1060 if ($filename =~ /\.gz$/) {
|
|
1061 open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n";
|
|
1062 }
|
3
|
1063 elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) {
|
0
|
1064 if ($samtools_path){
|
|
1065 open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
|
|
1066 }
|
|
1067 else{
|
|
1068 die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
|
|
1069 }
|
|
1070 }
|
|
1071 else {
|
|
1072 open (IN,$filename) or die "Can't open file $filename: $!\n";
|
|
1073 }
|
|
1074
|
|
1075 ### Vanilla and SAM output need to read different numbers of header lines
|
|
1076 if ($vanilla) {
|
|
1077 my $bismark_version = <IN>; ## discarding the Bismark version info
|
|
1078 chomp $bismark_version;
|
|
1079 $bismark_version =~ s/\r//; # replaces \r line feed
|
|
1080 $bismark_version =~ s/Bismark version: //;
|
|
1081 if ($bismark_version =~ /^\@/) {
|
|
1082 warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n";
|
|
1083 sleep (2);
|
|
1084 }
|
|
1085
|
|
1086 unless ($version eq $bismark_version){
|
|
1087 die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n";
|
|
1088 }
|
|
1089 } else {
|
|
1090 # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly.
|
|
1091 # We are reading from it further down
|
|
1092 }
|
|
1093
|
|
1094 my $output_filename = (split (/\//,$filename))[-1];
|
|
1095
|
|
1096 ### OPENING OUTPUT-FILEHANDLES
|
|
1097 if ($report) {
|
|
1098 my $report_filename = $output_filename;
|
|
1099 $report_filename =~ s/\.sam$//;
|
|
1100 $report_filename =~ s/\.txt$//;
|
|
1101 $report_filename =~ s/$/_splitting_report.txt/;
|
|
1102 $report_filename = $output_dir . $report_filename;
|
|
1103 open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n";
|
|
1104 }
|
|
1105
|
|
1106 if ($report) {
|
|
1107 print REPORT "$output_filename\n\n";
|
|
1108 print REPORT "Parameters used to extract methylation information:\n";
|
|
1109 if ($paired) {
|
|
1110 if ($vanilla) {
|
|
1111 print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n";
|
|
1112 } else {
|
|
1113 print REPORT "Bismark result file: paired-end (SAM format)\n"; # default
|
|
1114 }
|
|
1115 }
|
|
1116
|
|
1117 if ($single) {
|
|
1118 if ($vanilla) {
|
|
1119 print REPORT "Bismark result file: single-end (vanilla Bismark format)\n";
|
|
1120 } else {
|
|
1121 print REPORT "Bismark result file: single-end (SAM format)\n"; # default
|
|
1122 }
|
|
1123 }
|
3
|
1124 if ($single){
|
|
1125 if ($ignore) {
|
|
1126 print REPORT "Ignoring first $ignore bp\n";
|
|
1127 }
|
|
1128 }
|
|
1129 else{ # paired-end
|
|
1130 if ($ignore) {
|
|
1131 print REPORT "Ignoring first $ignore bp of Read 1\n";
|
|
1132 }
|
|
1133 if ($ignore_r2){
|
|
1134 print REPORT "Ignoring first $ignore_r2 bp of Read 2\n";
|
|
1135 }
|
0
|
1136 }
|
|
1137
|
|
1138 if ($full) {
|
|
1139 print REPORT "Output specified: comprehensive\n";
|
|
1140 } else {
|
|
1141 print REPORT "Output specified: strand-specific (default)\n";
|
|
1142 }
|
|
1143
|
|
1144 if ($no_overlap) {
|
|
1145 print REPORT "No overlapping methylation calls specified\n";
|
|
1146 }
|
|
1147 if ($genomic_fasta) {
|
|
1148 print REPORT "Genomic equivalent sequences will be printed out in FastA format\n";
|
|
1149 }
|
|
1150 if ($merge_non_CpG) {
|
|
1151 print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n";
|
|
1152 }
|
|
1153
|
|
1154 print REPORT "\n";
|
|
1155 }
|
|
1156
|
|
1157 ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
|
|
1158
|
|
1159 ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL
|
|
1160 ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file
|
|
1161 if ($full and $merge_non_CpG) {
|
|
1162 my $cpg_output = my $other_c_output = $output_filename;
|
|
1163 ### C in CpG context
|
|
1164 $cpg_output =~ s/^/CpG_context_/;
|
|
1165 $cpg_output =~ s/sam$/txt/;
|
|
1166 $cpg_output =~ s/bam$/txt/;
|
|
1167 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
1168 $cpg_output = $output_dir . $cpg_output;
|
|
1169
|
|
1170 if ($gzip){
|
|
1171 $cpg_output .= '.gz';
|
3
|
1172 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
|
0
|
1173 }
|
|
1174 else{
|
3
|
1175 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
|
0
|
1176 }
|
|
1177
|
3
|
1178 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
|
0
|
1179 push @sorting_files,$cpg_output;
|
|
1180
|
|
1181 unless ($no_header) {
|
3
|
1182 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1183 }
|
|
1184
|
|
1185 ### C in any other context than CpG
|
|
1186 $other_c_output =~ s/^/Non_CpG_context_/;
|
|
1187 $other_c_output =~ s/sam$/txt/;
|
|
1188 $other_c_output =~ s/bam$/txt/;
|
|
1189 $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/);
|
|
1190 $other_c_output = $output_dir . $other_c_output;
|
|
1191
|
|
1192 if ($gzip){
|
|
1193 $other_c_output .= '.gz';
|
3
|
1194 open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only);
|
0
|
1195 }
|
|
1196 else{
|
3
|
1197 open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only);
|
0
|
1198 }
|
|
1199
|
3
|
1200 warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only);
|
0
|
1201 push @sorting_files,$other_c_output;
|
|
1202
|
|
1203
|
|
1204 unless ($no_header) {
|
3
|
1205 print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1206 }
|
|
1207 }
|
|
1208
|
|
1209 ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found
|
|
1210 elsif ($merge_non_CpG) {
|
|
1211
|
|
1212 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
1213
|
|
1214 ### For cytosines in CpG context
|
|
1215 $cpg_ot =~ s/^/CpG_OT_/;
|
|
1216 $cpg_ot =~ s/sam$/txt/;
|
|
1217 $cpg_ot =~ s/bam$/txt/;
|
|
1218 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
1219 $cpg_ot = $output_dir . $cpg_ot;
|
|
1220
|
|
1221 if ($gzip){
|
|
1222 $cpg_ot .= '.gz';
|
3
|
1223 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
|
0
|
1224 }
|
|
1225 else{
|
3
|
1226 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
|
0
|
1227 }
|
|
1228
|
3
|
1229 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
|
0
|
1230 push @sorting_files,$cpg_ot;
|
|
1231
|
|
1232 unless($no_header){
|
3
|
1233 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1234 }
|
|
1235
|
|
1236 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
1237 $cpg_ctot =~ s/sam$/txt/;
|
|
1238 $cpg_ctot =~ s/bam$/txt/;
|
|
1239 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
1240 $cpg_ctot = $output_dir . $cpg_ctot;
|
|
1241
|
|
1242 if ($gzip){
|
|
1243 $cpg_ctot .= '.gz';
|
3
|
1244 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
|
0
|
1245 }
|
|
1246 else{
|
3
|
1247 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
|
0
|
1248 }
|
|
1249
|
3
|
1250 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
|
0
|
1251 push @sorting_files,$cpg_ctot;
|
|
1252
|
|
1253 unless($no_header){
|
3
|
1254 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1255 }
|
|
1256
|
|
1257 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
1258 $cpg_ctob =~ s/sam$/txt/;
|
|
1259 $cpg_ctob =~ s/bam$/txt/;
|
|
1260 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
1261 $cpg_ctob = $output_dir . $cpg_ctob;
|
|
1262
|
|
1263 if ($gzip){
|
|
1264 $cpg_ctob .= '.gz';
|
3
|
1265 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
|
0
|
1266 }
|
|
1267 else{
|
3
|
1268 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
|
0
|
1269 }
|
|
1270
|
3
|
1271 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
|
0
|
1272 push @sorting_files,$cpg_ctob;
|
|
1273
|
|
1274 unless($no_header){
|
3
|
1275 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1276 }
|
|
1277
|
|
1278 $cpg_ob =~ s/^/CpG_OB_/;
|
|
1279 $cpg_ob =~ s/sam$/txt/;
|
|
1280 $cpg_ob =~ s/bam$/txt/;
|
|
1281 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
1282 $cpg_ob = $output_dir . $cpg_ob;
|
|
1283
|
|
1284 if ($gzip){
|
|
1285 $cpg_ob .= '.gz';
|
3
|
1286 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
|
0
|
1287 }
|
|
1288 else{
|
3
|
1289 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
|
0
|
1290 }
|
|
1291
|
3
|
1292 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
|
0
|
1293 push @sorting_files,$cpg_ob;
|
|
1294
|
|
1295 unless($no_header){
|
3
|
1296 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1297 }
|
|
1298
|
|
1299 ### For cytosines in Non-CpG (CC, CT or CA) context
|
|
1300 my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename;
|
|
1301
|
|
1302 $other_c_ot =~ s/^/Non_CpG_OT_/;
|
|
1303 $other_c_ot =~ s/sam$/txt/;
|
|
1304 $other_c_ot =~ s/bam$/txt/;
|
|
1305 $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/);
|
|
1306 $other_c_ot = $output_dir . $other_c_ot;
|
|
1307
|
|
1308 if ($gzip){
|
|
1309 $other_c_ot .= '.gz';
|
3
|
1310 open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
|
0
|
1311 }
|
|
1312 else{
|
3
|
1313 open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only);
|
0
|
1314 }
|
|
1315
|
3
|
1316 warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only);
|
0
|
1317 push @sorting_files,$other_c_ot;
|
|
1318
|
|
1319 unless($no_header){
|
3
|
1320 print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1321 }
|
|
1322
|
|
1323 $other_c_ctot =~ s/^/Non_CpG_CTOT_/;
|
|
1324 $other_c_ctot =~ s/sam$/txt/;
|
|
1325 $other_c_ctot =~ s/bam$/txt/;
|
|
1326 $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/);
|
|
1327 $other_c_ctot = $output_dir . $other_c_ctot;
|
|
1328
|
|
1329 if ($gzip){
|
|
1330 $other_c_ctot .= '.gz';
|
3
|
1331 open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
|
0
|
1332 }
|
|
1333 else{
|
3
|
1334 open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only);
|
0
|
1335 }
|
|
1336
|
3
|
1337 warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only);
|
0
|
1338 push @sorting_files,$other_c_ctot;
|
|
1339
|
|
1340 unless($no_header){
|
3
|
1341 print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1342 }
|
|
1343
|
|
1344 $other_c_ctob =~ s/^/Non_CpG_CTOB_/;
|
|
1345 $other_c_ctob =~ s/sam$/txt/;
|
|
1346 $other_c_ctob =~ s/bam$/txt/;
|
|
1347 $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/);
|
|
1348 $other_c_ctob = $output_dir . $other_c_ctob;
|
|
1349
|
|
1350 if ($gzip){
|
|
1351 $other_c_ctob .= '.gz';
|
3
|
1352 open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
|
0
|
1353 }
|
|
1354 else{
|
3
|
1355 open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only);
|
0
|
1356 }
|
|
1357
|
3
|
1358 warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only);
|
0
|
1359 push @sorting_files,$other_c_ctob;
|
|
1360
|
|
1361 unless($no_header){
|
3
|
1362 print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1363 }
|
|
1364
|
|
1365 $other_c_ob =~ s/^/Non_CpG_OB_/;
|
|
1366 $other_c_ob =~ s/sam$/txt/;
|
|
1367 $other_c_ob =~ s/sam$/txt/;
|
|
1368 $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/);
|
|
1369 $other_c_ob = $output_dir . $other_c_ob;
|
|
1370
|
|
1371 if ($gzip){
|
|
1372 $other_c_ob .= '.gz';
|
3
|
1373 open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
|
0
|
1374 }
|
|
1375 else{
|
3
|
1376 open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only);
|
0
|
1377 }
|
|
1378
|
3
|
1379 warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only);
|
0
|
1380 push @sorting_files,$other_c_ob;
|
|
1381
|
|
1382 unless($no_header){
|
3
|
1383 print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1384 }
|
|
1385 }
|
|
1386 ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context)
|
|
1387
|
|
1388 ### if --comprehensive was specified we are only writing one file per context
|
|
1389 elsif ($full) {
|
|
1390 my $cpg_output = my $chg_output = my $chh_output = $output_filename;
|
|
1391 ### C in CpG context
|
|
1392 $cpg_output =~ s/^/CpG_context_/;
|
|
1393 $cpg_output =~ s/sam$/txt/;
|
|
1394 $cpg_output =~ s/bam$/txt/;
|
|
1395 $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/);
|
|
1396 $cpg_output = $output_dir . $cpg_output;
|
|
1397
|
|
1398 if ($gzip){
|
|
1399 $cpg_output .= '.gz';
|
3
|
1400 open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
|
0
|
1401 }
|
|
1402 else{
|
3
|
1403 open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only);
|
0
|
1404 }
|
|
1405
|
3
|
1406 warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only);
|
0
|
1407 push @sorting_files,$cpg_output;
|
|
1408
|
|
1409 unless($no_header){
|
3
|
1410 print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1411 }
|
|
1412
|
|
1413 ### C in CHG context
|
|
1414 $chg_output =~ s/^/CHG_context_/;
|
|
1415 $chg_output =~ s/sam$/txt/;
|
|
1416 $chg_output =~ s/bam$/txt/;
|
|
1417 $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/);
|
|
1418 $chg_output = $output_dir . $chg_output;
|
|
1419
|
|
1420 if ($gzip){
|
|
1421 $chg_output .= '.gz';
|
3
|
1422 open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only);
|
0
|
1423 }
|
|
1424 else{
|
3
|
1425 open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only);
|
0
|
1426 }
|
|
1427
|
3
|
1428 warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only);
|
0
|
1429 push @sorting_files,$chg_output;
|
|
1430
|
|
1431 unless($no_header){
|
3
|
1432 print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1433 }
|
|
1434
|
|
1435 ### C in CHH context
|
|
1436 $chh_output =~ s/^/CHH_context_/;
|
|
1437 $chh_output =~ s/sam$/txt/;
|
|
1438 $chh_output =~ s/bam$/txt/;
|
|
1439 $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/);
|
|
1440 $chh_output = $output_dir . $chh_output;
|
|
1441
|
|
1442 if ($gzip){
|
|
1443 $chh_output .= '.gz';
|
3
|
1444 open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only);
|
0
|
1445 }
|
|
1446 else{
|
3
|
1447 open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only);
|
0
|
1448 }
|
|
1449
|
3
|
1450 warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only);
|
0
|
1451 push @sorting_files, $chh_output;
|
|
1452
|
|
1453 unless($no_header){
|
3
|
1454 print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1455 }
|
|
1456 }
|
|
1457 ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found
|
|
1458 else {
|
|
1459 my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename;
|
|
1460
|
|
1461 ### For cytosines in CpG context
|
|
1462 $cpg_ot =~ s/^/CpG_OT_/;
|
|
1463 $cpg_ot =~ s/sam$/txt/;
|
|
1464 $cpg_ot =~ s/bam$/txt/;
|
|
1465 $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/);
|
|
1466 $cpg_ot = $output_dir . $cpg_ot;
|
|
1467
|
|
1468 if ($gzip){
|
|
1469 $cpg_ot .= '.gz';
|
3
|
1470 open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
|
0
|
1471 }
|
|
1472 else{
|
3
|
1473 open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only);
|
0
|
1474 }
|
|
1475
|
3
|
1476 warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only);
|
0
|
1477 push @sorting_files,$cpg_ot;
|
|
1478
|
|
1479 unless($no_header){
|
3
|
1480 print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1481 }
|
|
1482
|
|
1483 $cpg_ctot =~ s/^/CpG_CTOT_/;
|
|
1484 $cpg_ctot =~ s/sam$/txt/;
|
|
1485 $cpg_ctot =~ s/bam$/txt/;
|
|
1486 $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/);
|
|
1487 $cpg_ctot = $output_dir . $cpg_ctot;
|
|
1488
|
|
1489 if ($gzip){
|
|
1490 $cpg_ctot .= '.gz';
|
3
|
1491 open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
|
0
|
1492 }
|
|
1493 else{
|
3
|
1494 open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only);
|
0
|
1495 }
|
|
1496
|
3
|
1497 warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only);
|
0
|
1498 push @sorting_files,$cpg_ctot;
|
|
1499
|
|
1500 unless($no_header){
|
3
|
1501 print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1502 }
|
|
1503
|
|
1504 $cpg_ctob =~ s/^/CpG_CTOB_/;
|
|
1505 $cpg_ctob =~ s/sam$/txt/;
|
|
1506 $cpg_ctob =~ s/bam$/txt/;
|
|
1507 $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/);
|
|
1508 $cpg_ctob = $output_dir . $cpg_ctob;
|
|
1509
|
|
1510 if ($gzip){
|
|
1511 $cpg_ctob .= '.gz';
|
3
|
1512 open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
|
0
|
1513 }
|
|
1514 else{
|
3
|
1515 open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only);
|
0
|
1516 }
|
|
1517
|
3
|
1518 warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only);
|
0
|
1519 push @sorting_files,$cpg_ctob;
|
|
1520
|
|
1521 unless($no_header){
|
3
|
1522 print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1523 }
|
|
1524
|
|
1525 $cpg_ob =~ s/^/CpG_OB_/;
|
|
1526 $cpg_ob =~ s/sam$/txt/;
|
|
1527 $cpg_ob =~ s/bam$/txt/;
|
|
1528 $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/);
|
|
1529 $cpg_ob = $output_dir . $cpg_ob;
|
|
1530
|
|
1531 if ($gzip){
|
|
1532 $cpg_ob .= '.gz';
|
3
|
1533 open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
|
0
|
1534 }
|
|
1535 else{
|
3
|
1536 open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only);
|
0
|
1537 }
|
|
1538
|
3
|
1539 warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only);
|
0
|
1540 push @sorting_files,$cpg_ob;
|
|
1541
|
|
1542 unless($no_header){
|
3
|
1543 print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1544 }
|
|
1545
|
|
1546 ### For cytosines in CHG context
|
|
1547 my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename;
|
|
1548
|
|
1549 $chg_ot =~ s/^/CHG_OT_/;
|
|
1550 $chg_ot =~ s/sam$/txt/;
|
|
1551 $chg_ot =~ s/bam$/txt/;
|
|
1552 $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/);
|
|
1553 $chg_ot = $output_dir . $chg_ot;
|
|
1554
|
|
1555 if ($gzip){
|
|
1556 $chg_ot .= '.gz';
|
3
|
1557 open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
|
0
|
1558 }
|
|
1559 else{
|
3
|
1560 open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only);
|
0
|
1561 }
|
|
1562
|
3
|
1563 warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only);
|
0
|
1564 push @sorting_files,$chg_ot;
|
|
1565
|
|
1566 unless($no_header){
|
3
|
1567 print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1568 }
|
|
1569
|
|
1570 $chg_ctot =~ s/^/CHG_CTOT_/;
|
|
1571 $chg_ctot =~ s/sam$/txt/;
|
|
1572 $chg_ctot =~ s/bam$/txt/;
|
|
1573 $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/);
|
|
1574 $chg_ctot = $output_dir . $chg_ctot;
|
|
1575
|
|
1576 if ($gzip){
|
|
1577 $chg_ctot .= '.gz';
|
3
|
1578 open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
|
0
|
1579 }
|
|
1580 else{
|
3
|
1581 open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only);
|
0
|
1582 }
|
|
1583
|
3
|
1584 warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only);
|
0
|
1585 push @sorting_files,$chg_ctot;
|
|
1586
|
|
1587 unless($no_header){
|
3
|
1588 print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1589 }
|
|
1590
|
|
1591 $chg_ctob =~ s/^/CHG_CTOB_/;
|
|
1592 $chg_ctob =~ s/sam$/txt/;
|
|
1593 $chg_ctob =~ s/bam$/txt/;
|
|
1594 $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/);
|
|
1595 $chg_ctob = $output_dir . $chg_ctob;
|
|
1596
|
|
1597 if ($gzip){
|
|
1598 $chg_ctob .= '.gz';
|
3
|
1599 open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
|
0
|
1600 }
|
|
1601 else{
|
3
|
1602 open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only);
|
0
|
1603 }
|
|
1604
|
3
|
1605 warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only);
|
0
|
1606 push @sorting_files,$chg_ctob;
|
|
1607
|
|
1608 unless($no_header){
|
3
|
1609 print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1610 }
|
|
1611
|
|
1612 $chg_ob =~ s/^/CHG_OB_/;
|
|
1613 $chg_ob =~ s/sam$/txt/;
|
|
1614 $chg_ob =~ s/bam$/txt/;
|
|
1615 $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/);
|
|
1616 $chg_ob = $output_dir . $chg_ob;
|
|
1617
|
|
1618 if ($gzip){
|
|
1619 $chg_ob .= '.gz';
|
3
|
1620 open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
|
0
|
1621 }
|
|
1622 else{
|
3
|
1623 open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only);
|
0
|
1624 }
|
|
1625
|
3
|
1626 warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only);
|
0
|
1627 push @sorting_files,$chg_ob;
|
|
1628
|
|
1629 unless($no_header){
|
3
|
1630 print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1631 }
|
|
1632
|
|
1633 ### For cytosines in CHH context
|
|
1634 my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename;
|
|
1635
|
|
1636 $chh_ot =~ s/^/CHH_OT_/;
|
|
1637 $chh_ot =~ s/sam$/txt/;
|
|
1638 $chh_ot =~ s/bam$/txt/;
|
|
1639 $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/);
|
|
1640 $chh_ot = $output_dir . $chh_ot;
|
|
1641
|
|
1642 if ($gzip){
|
|
1643 $chh_ot .= '.gz';
|
3
|
1644 open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
|
0
|
1645 }
|
|
1646 else{
|
3
|
1647 open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only);
|
0
|
1648 }
|
|
1649
|
3
|
1650 warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only);
|
0
|
1651 push @sorting_files,$chh_ot;
|
|
1652
|
|
1653 unless($no_header){
|
3
|
1654 print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1655 }
|
|
1656
|
|
1657 $chh_ctot =~ s/^/CHH_CTOT_/;
|
|
1658 $chh_ctot =~ s/sam$/txt/;
|
|
1659 $chh_ctot =~ s/bam$/txt/;
|
|
1660 $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/);
|
|
1661 $chh_ctot = $output_dir . $chh_ctot;
|
|
1662
|
|
1663 if ($gzip){
|
|
1664 $chh_ctot .= '.gz';
|
3
|
1665 open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
|
0
|
1666 }
|
|
1667 else{
|
3
|
1668 open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only);
|
0
|
1669 }
|
|
1670
|
3
|
1671 warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only);
|
0
|
1672 push @sorting_files,$chh_ctot;
|
|
1673
|
|
1674 unless($no_header){
|
3
|
1675 print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1676 }
|
|
1677
|
|
1678 $chh_ctob =~ s/^/CHH_CTOB_/;
|
|
1679 $chh_ctob =~ s/sam$/txt/;
|
|
1680 $chh_ctob =~ s/bam$/txt/;
|
|
1681 $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/);
|
|
1682 $chh_ctob = $output_dir . $chh_ctob;
|
|
1683
|
|
1684 if ($gzip){
|
|
1685 $chh_ctob .= '.gz';
|
3
|
1686 open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
|
0
|
1687 }
|
|
1688 else{
|
3
|
1689 open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only);
|
0
|
1690 }
|
|
1691
|
3
|
1692 warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only);
|
0
|
1693 push @sorting_files,$chh_ctob;
|
|
1694
|
|
1695 unless($no_header){
|
3
|
1696 print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1697 }
|
|
1698
|
|
1699 $chh_ob =~ s/^/CHH_OB_/;
|
|
1700 $chh_ob =~ s/sam$/txt/;
|
|
1701 $chh_ob =~ s/bam$/txt/;
|
|
1702 $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/);
|
|
1703 $chh_ob = $output_dir . $chh_ob;
|
|
1704
|
|
1705 if ($gzip){
|
|
1706 $chh_ob .= '.gz';
|
3
|
1707 open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
|
0
|
1708 }
|
|
1709 else{
|
3
|
1710 open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only);
|
0
|
1711 }
|
|
1712
|
3
|
1713 warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only);
|
0
|
1714 push @sorting_files,$chh_ob;
|
|
1715
|
|
1716 unless($no_header){
|
3
|
1717 print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only);
|
0
|
1718 }
|
|
1719 }
|
|
1720
|
|
1721 my $methylation_call_strings_processed = 0;
|
|
1722 my $line_count = 0;
|
|
1723
|
|
1724 ### proceeding differently now for single-end or paired-end Bismark files
|
|
1725
|
|
1726 ### PROCESSING SINGLE-END RESULT FILES
|
|
1727 if ($single) {
|
|
1728
|
|
1729 ### also proceeding differently now for SAM format or vanilla Bismark format files
|
|
1730 if ($vanilla) { # old vanilla Bismark output format
|
|
1731 while (<IN>) {
|
|
1732 ++$line_count;
|
|
1733 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
1734
|
|
1735 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
1736 my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9];
|
|
1737
|
|
1738 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or
|
|
1739 ### last position
|
|
1740 chomp $genome_conversion;
|
|
1741
|
|
1742 my $index;
|
|
1743 if ($meth_call) {
|
|
1744
|
|
1745 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
|
|
1746 $index = 0;
|
|
1747 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
|
|
1748 $index = 1;
|
|
1749 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
|
|
1750 $index = 3;
|
|
1751 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
|
|
1752 $index = 2;
|
|
1753 } else {
|
|
1754 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
1755 }
|
|
1756
|
|
1757 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
1758 if ($ignore) {
|
|
1759 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
1760
|
|
1761 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
1762 if ($strand eq '+') {
|
|
1763 $start += $ignore;
|
|
1764 } elsif ($strand eq '-') {
|
|
1765 $start += length($meth_call)-1; ## $meth_call is already shortened!
|
|
1766 } else {
|
|
1767 die "Alignment did not have proper strand information: $strand\n";
|
|
1768 }
|
|
1769 }
|
|
1770 ### printing out the methylation state of every C in the read
|
|
1771 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
|
|
1772
|
|
1773 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
1774 }
|
|
1775 }
|
|
1776 } else { # processing single-end SAM format (default)
|
|
1777 while (<IN>) {
|
|
1778 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
1779 if (/^\@/) { # skipping header lines (starting with @)
|
|
1780 warn "skipping SAM header line:\t$_";
|
|
1781 next;
|
|
1782 }
|
|
1783
|
|
1784 ++$line_count;
|
|
1785 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
1786
|
|
1787 # example read in SAM format
|
|
1788 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
1789 ###
|
|
1790
|
|
1791 # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
1792 # < 0.7.6 $meth_call =~ s/^XM:Z://;
|
|
1793 # < 0.7.6 $read_conversion =~ s/^XR:Z://;
|
|
1794 # < 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
1795
|
|
1796 my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5];
|
|
1797
|
|
1798 ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
1799 my $meth_call; ### Thanks to Zachary Zeno for this solution
|
|
1800 my $read_conversion;
|
|
1801 my $genome_conversion;
|
|
1802
|
|
1803 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
1804 my $tag = $1;
|
|
1805 my $value = $2;
|
|
1806
|
|
1807 if ($tag eq "XM") {
|
|
1808 $meth_call = $value;
|
|
1809 $meth_call =~ s/\r//;
|
|
1810 } elsif ($tag eq "XR") {
|
|
1811 $read_conversion = $value;
|
|
1812 $read_conversion =~ s/\r//;
|
|
1813 } elsif ($tag eq "XG") {
|
|
1814 $genome_conversion = $value;
|
|
1815 $genome_conversion =~ s/\r//;
|
|
1816 }
|
|
1817 }
|
|
1818
|
|
1819 my $strand;
|
|
1820 chomp $genome_conversion;
|
|
1821 # print "$meth_call\n$read_conversion\n$genome_conversion\n";
|
|
1822
|
|
1823 my $index;
|
|
1824 if ($meth_call) {
|
|
1825 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
|
|
1826 $index = 0;
|
|
1827 $strand = '+';
|
|
1828 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
|
|
1829 $index = 1;
|
|
1830 $strand = '-';
|
|
1831 } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
|
|
1832 $index = 2;
|
|
1833 $strand = '+';
|
|
1834 } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
|
|
1835 $index = 3;
|
|
1836 $strand = '-';
|
|
1837 } else {
|
|
1838 die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
|
|
1839 }
|
|
1840
|
|
1841 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
|
|
1842 if ($strand eq '-') {
|
|
1843 $meth_call = reverse $meth_call;
|
|
1844 }
|
|
1845
|
|
1846 ### Clipping off the first <int> number of bases from the methylation call string as specified with --ignore <int>
|
|
1847 if ($ignore) {
|
|
1848 # print "\n\n$meth_call\n";
|
|
1849 $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore);
|
|
1850 # print "$meth_call\n";
|
3
|
1851
|
0
|
1852 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
1853
|
|
1854 my @len = split (/\D+/,$cigar); # storing the length per operation
|
|
1855 my @ops = split (/\d+/,$cigar); # storing the operation
|
|
1856 shift @ops; # remove the empty first element
|
|
1857 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
1858
|
|
1859 my @comp_cigar; # building an array with all CIGAR operations
|
|
1860 foreach my $index (0..$#len) {
|
|
1861 foreach (1..$len[$index]) {
|
|
1862 # print "$ops[$index]";
|
|
1863 push @comp_cigar, $ops[$index];
|
|
1864 }
|
|
1865 }
|
|
1866 # print "original CIGAR: $cigar\n";
|
|
1867 # print "original CIGAR: @comp_cigar\n";
|
|
1868
|
|
1869 ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly!
|
|
1870 if ($strand eq '+') {
|
|
1871
|
|
1872 my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions
|
|
1873 my $I_count = 0;
|
|
1874
|
|
1875 for (1..$ignore) {
|
|
1876 my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start
|
|
1877 # print "$_ deleted $op\n";
|
|
1878
|
|
1879 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1880 $D_count++;
|
|
1881 $op = shift @comp_cigar;
|
|
1882 # print "$_ deleted $op\n";
|
|
1883 }
|
|
1884 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
1885 $I_count++;
|
|
1886 }
|
|
1887 }
|
|
1888 $start += $ignore + $D_count - $I_count;
|
|
1889 # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n";
|
|
1890 } elsif ($strand eq '-') {
|
|
1891
|
|
1892 for (1..$ignore) {
|
|
1893 my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
1894 while ($op eq 'D') { # repeating this for deletions (D)
|
|
1895 $op = pop @comp_cigar;
|
|
1896 }
|
|
1897 }
|
|
1898
|
|
1899 ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR
|
|
1900 ### string to be able to work out the starting position of the read which is on the 3' end of the sequence
|
|
1901 my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position
|
|
1902 foreach (@comp_cigar) {
|
|
1903 ++$MD_count if ($_ eq 'M' or $_ eq 'D');
|
|
1904 }
|
|
1905 $start += $MD_count - 1;
|
|
1906 }
|
|
1907
|
|
1908 ### reconstituting shortened CIGAR string
|
|
1909 my $new_cigar;
|
|
1910 my $count = 0;
|
|
1911 my $last_op;
|
|
1912 # print "ignore adjusted: @comp_cigar\n";
|
|
1913 foreach my $op (@comp_cigar) {
|
|
1914 unless (defined $last_op){
|
|
1915 $last_op = $op;
|
|
1916 ++$count;
|
|
1917 next;
|
|
1918 }
|
|
1919 if ($last_op eq $op) {
|
|
1920 ++$count;
|
|
1921 } else {
|
|
1922 $new_cigar .= "$count$last_op";
|
|
1923 $last_op = $op;
|
|
1924 $count = 1;
|
|
1925 }
|
|
1926 }
|
|
1927 $new_cigar .= "$count$last_op"; # appending the last operation and count
|
|
1928 $cigar = $new_cigar;
|
|
1929 # print "ignore adjusted scalar: $cigar\n";
|
|
1930 }
|
|
1931 }
|
|
1932 ### printing out the methylation state of every C in the read
|
|
1933 print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar);
|
|
1934
|
|
1935 ++$methylation_call_strings_processed; # 1 per single-end result
|
|
1936 }
|
|
1937 }
|
|
1938 }
|
|
1939
|
|
1940 ### PROCESSING PAIRED-END RESULT FILES
|
|
1941 elsif ($paired) {
|
|
1942
|
|
1943 ### proceeding differently now for SAM format or vanilla Bismark format files
|
|
1944 if ($vanilla) { # old vanilla Bismark paired-end output format
|
|
1945 while (<IN>) {
|
|
1946 ++$line_count;
|
|
1947 warn "processed line: $line_count\n" if ($line_count%500000==0);
|
|
1948
|
|
1949 ### $seq here is the chromosomal sequence (to use for the repeat analysis for example)
|
|
1950 my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13];
|
|
1951
|
|
1952 my $index;
|
|
1953 chomp $genome_conversion;
|
|
1954
|
|
1955 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
|
|
1956 $index = 0; ## this is OT
|
|
1957 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
|
|
1958 $index = 2; ## this is CTOB!!!
|
|
1959 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
|
|
1960 $index = 1; ## this is CTOT!!!
|
|
1961 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
|
|
1962 $index = 3; ## this is OB
|
|
1963 } else {
|
|
1964 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
1965 }
|
|
1966
|
|
1967 if ($meth_call_1 and $meth_call_2) {
|
|
1968 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
|
3
|
1969
|
0
|
1970 if ($ignore) {
|
|
1971 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
3
|
1972
|
0
|
1973 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified
|
|
1974 $start_read_1 += $ignore;
|
|
1975 }
|
3
|
1976 if ($ignore_r2) {
|
|
1977 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
|
|
1978
|
|
1979 ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified
|
|
1980 $end_read_2 -= $ignore_r2;
|
|
1981 }
|
|
1982
|
0
|
1983 my $end_read_1;
|
|
1984 my $start_read_2;
|
|
1985
|
|
1986 if ($strand eq '+') {
|
|
1987
|
|
1988 $end_read_1 = $start_read_1+length($meth_call_1)-1;
|
|
1989 $start_read_2 = $end_read_2-length($meth_call_2)+1;
|
3
|
1990
|
0
|
1991 ## we first pass the first read which is in + orientation on the forward strand
|
3
|
1992 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity
|
0
|
1993
|
|
1994 # we next pass the second read which is in - orientation on the reverse strand
|
|
1995 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
3
|
1996 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2);
|
|
1997 }
|
|
1998 else {
|
0
|
1999
|
|
2000 $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read!
|
|
2001 $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read!
|
|
2002
|
|
2003 ## we first pass the first read which is in - orientation on the reverse strand
|
3
|
2004 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1);
|
0
|
2005
|
|
2006 # we next pass the second read which is in + orientation on the forward strand
|
|
2007 ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
3
|
2008 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2);
|
0
|
2009 }
|
|
2010
|
|
2011 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
2012 }
|
|
2013 }
|
3
|
2014 }
|
|
2015 else { # Bismark paired-end SAM output format (default)
|
0
|
2016 while (<IN>) {
|
|
2017 ### SAM format can either start with header lines (starting with @) or start with alignments directly
|
|
2018 if (/^\@/) { # skipping header lines (starting with @)
|
|
2019 warn "skipping SAM header line:\t$_";
|
|
2020 next;
|
|
2021 }
|
|
2022
|
|
2023 ++$line_count;
|
|
2024 warn "Processed lines: $line_count\n" if ($line_count%500000==0);
|
|
2025
|
|
2026 # example paired-end reads in SAM format (2 consecutive lines)
|
|
2027 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT
|
|
2028 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT
|
|
2029
|
|
2030 # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
|
|
2031
|
|
2032 my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
2033 my $meth_call_1;
|
|
2034 my $first_read_conversion;
|
|
2035 my $genome_conversion;
|
|
2036
|
|
2037 while ( /(XM|XR|XG):Z:([^\t]+)/g ) {
|
|
2038 my $tag = $1;
|
|
2039 my $value = $2;
|
|
2040
|
|
2041 if ($tag eq "XM") {
|
|
2042 $meth_call_1 = $value;
|
|
2043 $meth_call_1 =~ s/\r//;
|
|
2044 } elsif ($tag eq "XR") {
|
|
2045 $first_read_conversion = $value;
|
|
2046 $first_read_conversion =~ s/\r//;
|
|
2047 } elsif ($tag eq "XG") {
|
|
2048 $genome_conversion = $value;
|
|
2049 $genome_conversion =~ s/\r//;
|
|
2050 }
|
|
2051 }
|
|
2052
|
|
2053 $_ = <IN>; # reading in the paired read
|
|
2054
|
|
2055 # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
|
|
2056 # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://;
|
|
2057 # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://;
|
|
2058 # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://;
|
|
2059 # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://;
|
|
2060
|
|
2061 my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression
|
|
2062
|
|
2063 my $meth_call_2;
|
|
2064 my $second_read_conversion;
|
|
2065
|
|
2066 while ( /(XM|XR):Z:([^\t]+)/g ) {
|
|
2067 my $tag = $1;
|
|
2068 my $value = $2;
|
|
2069
|
|
2070 if ($tag eq "XM") {
|
|
2071 $meth_call_2 = $value;
|
|
2072 $meth_call_2 =~ s/\r//;
|
|
2073 } elsif ($tag eq "XR") {
|
|
2074 $second_read_conversion = $value;
|
|
2075 $second_read_conversion = s/\r//;
|
|
2076 }
|
|
2077 }
|
|
2078
|
|
2079 # < version 0.7.6 $genome_conversion =~ s/^XG:Z://;
|
|
2080 chomp $genome_conversion; # in case it captured a new line character
|
|
2081
|
|
2082 # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n";
|
|
2083
|
|
2084 my $index;
|
|
2085 my $strand;
|
|
2086
|
|
2087 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') {
|
|
2088 $index = 0; ## this is OT
|
|
2089 $strand = '+';
|
|
2090 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') {
|
|
2091 $index = 1; ## this is CTOT
|
|
2092 $strand = '-';
|
|
2093 } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') {
|
|
2094 $index = 2; ## this is CTOB
|
|
2095 $strand = '+';
|
|
2096 } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') {
|
|
2097 $index = 3; ## this is OB
|
|
2098 $strand = '-';
|
|
2099 } else {
|
|
2100 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
|
|
2101 }
|
|
2102
|
|
2103 ### reversing the methylation call of the read that was reverse-complemented
|
|
2104 if ($strand eq '+') {
|
|
2105 $meth_call_2 = reverse $meth_call_2;
|
|
2106 } else {
|
|
2107 $meth_call_1 = reverse $meth_call_1;
|
|
2108 }
|
|
2109
|
|
2110 if ($meth_call_1 and $meth_call_2) {
|
|
2111
|
|
2112 my $end_read_1;
|
|
2113
|
|
2114 ### READ 1
|
|
2115 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
|
|
2116 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
|
|
2117 shift @ops_1; # remove the empty first element
|
3
|
2118
|
|
2119 die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1);
|
0
|
2120
|
|
2121 my @comp_cigar_1; # building an array with all CIGAR operations
|
|
2122 foreach my $index (0..$#len_1) {
|
|
2123 foreach (1..$len_1[$index]) {
|
|
2124 # print "$ops_1[$index]";
|
|
2125 push @comp_cigar_1, $ops_1[$index];
|
|
2126 }
|
|
2127 }
|
|
2128 # print "original CIGAR read 1: $cigar_1\n";
|
|
2129 # print "original CIGAR read 1: @comp_cigar_1\n";
|
|
2130
|
|
2131 ### READ 2
|
|
2132 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
|
|
2133 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
|
|
2134 shift @ops_2; # remove the empty first element
|
|
2135 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
|
|
2136 my @comp_cigar_2; # building an array with all CIGAR operations for read 2
|
|
2137 foreach my $index (0..$#len_2) {
|
|
2138 foreach (1..$len_2[$index]) {
|
|
2139 # print "$ops_2[$index]";
|
|
2140 push @comp_cigar_2, $ops_2[$index];
|
|
2141 }
|
|
2142 }
|
|
2143 # print "original CIGAR read 2: $cigar_2\n";
|
|
2144 # print "original CIGAR read 2: @comp_cigar_2\n";
|
|
2145
|
3
|
2146
|
|
2147
|
0
|
2148 if ($ignore) {
|
3
|
2149 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' for read 1
|
0
|
2150 ### the methylation calls have already been reversed where necessary
|
|
2151 $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore);
|
|
2152
|
|
2153 if ($strand eq '+') {
|
|
2154
|
|
2155 ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start
|
|
2156 my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions
|
|
2157 my $I_count_1 = 0;
|
3
|
2158
|
0
|
2159 for (1..$ignore) {
|
|
2160 my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start
|
|
2161 # print "$_ deleted $op\n";
|
|
2162
|
|
2163 while ($op eq 'D') { # repeating this for deletions (D)
|
|
2164 $D_count_1++;
|
|
2165 $op = shift @comp_cigar_1;
|
|
2166 # print "$_ deleted $op\n";
|
|
2167 }
|
|
2168 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
2169 $I_count_1++;
|
|
2170 }
|
|
2171 }
|
|
2172
|
|
2173 $start_read_1 += $ignore + $D_count_1 - $I_count_1;
|
|
2174 # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n";
|
3
|
2175
|
0
|
2176 # the start position of reads mapping to the reverse strand is being adjusted further below
|
3
|
2177 }
|
|
2178 elsif ($strand eq '-') {
|
0
|
2179
|
|
2180 ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back
|
|
2181 for (1..$ignore) {
|
|
2182 my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
2183 while ($op eq 'D') { # repeating this for deletions (D)
|
|
2184 $op = pop @comp_cigar_1;
|
|
2185 }
|
|
2186 }
|
|
2187 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
2188
|
3
|
2189 }
|
|
2190 }
|
|
2191
|
|
2192 if ($ignore_r2) {
|
|
2193 ### Clipping off the first <int> number of bases from the methylation call string as specified with '--ignore_r2 <int>' for read 2
|
|
2194 ### the methylation calls have already been reversed where necessary
|
|
2195 $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2);
|
|
2196
|
|
2197 ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly
|
|
2198
|
|
2199 if ($strand eq '+') {
|
|
2200
|
|
2201 ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back
|
|
2202
|
|
2203 for (1..$ignore_r2) {
|
|
2204 my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array
|
|
2205 while ($op eq 'D') { # repeating this for deletions (D)
|
|
2206 $op = pop @comp_cigar_2;
|
|
2207 }
|
|
2208 }
|
|
2209 # the start position of reads mapping to the reverse strand is being adjusted further below
|
|
2210 }
|
|
2211 elsif ($strand eq '-') {
|
|
2212
|
0
|
2213 ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start
|
|
2214 my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions
|
3
|
2215 my $I_count_2 = 0;
|
|
2216
|
|
2217 for (1..$ignore_r2) {
|
0
|
2218 my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start
|
|
2219 # print "$_ deleted $op\n";
|
|
2220
|
|
2221 while ($op eq 'D') { # repeating this for deletions (D)
|
|
2222 $D_count_2++;
|
|
2223 $op = shift @comp_cigar_2;
|
|
2224 # print "$_ deleted $op\n";
|
|
2225 }
|
|
2226 if ($op eq 'I') { # adjusting the genomic position for insertions (I)
|
|
2227 $I_count_2++;
|
|
2228 }
|
|
2229 }
|
|
2230
|
3
|
2231 $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2;
|
|
2232 # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n";
|
|
2233 }
|
|
2234 }
|
0
|
2235
|
3
|
2236 if ($ignore){
|
0
|
2237 ### reconstituting shortened CIGAR string 1
|
|
2238 my $new_cigar_1;
|
|
2239 my $count_1 = 0;
|
|
2240 my $last_op_1;
|
|
2241 # print "ignore adjusted CIGAR 1: @comp_cigar_1\n";
|
|
2242 foreach my $op (@comp_cigar_1) {
|
|
2243 unless (defined $last_op_1){
|
|
2244 $last_op_1 = $op;
|
|
2245 ++$count_1;
|
|
2246 next;
|
|
2247 }
|
|
2248 if ($last_op_1 eq $op) {
|
|
2249 ++$count_1;
|
|
2250 } else {
|
|
2251 $new_cigar_1 .= "$count_1$last_op_1";
|
|
2252 $last_op_1 = $op;
|
|
2253 $count_1 = 1;
|
|
2254 }
|
|
2255 }
|
|
2256 $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count
|
|
2257 $cigar_1 = $new_cigar_1;
|
|
2258 # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n";
|
3
|
2259 }
|
|
2260
|
|
2261 if ($ignore_r2){
|
0
|
2262
|
|
2263 ### reconstituting shortened CIGAR string 2
|
|
2264 my $new_cigar_2;
|
|
2265 my $count_2 = 0;
|
|
2266 my $last_op_2;
|
|
2267 # print "ignore adjusted CIGAR 2: @comp_cigar_2\n";
|
|
2268 foreach my $op (@comp_cigar_2) {
|
|
2269 unless (defined $last_op_2){
|
|
2270 $last_op_2 = $op;
|
|
2271 ++$count_2;
|
|
2272 next;
|
|
2273 }
|
|
2274 if ($last_op_2 eq $op) {
|
|
2275 ++$count_2;
|
3
|
2276 }
|
|
2277 else {
|
0
|
2278 $new_cigar_2 .= "$count_2$last_op_2";
|
|
2279 $last_op_2 = $op;
|
|
2280 $count_2 = 1;
|
|
2281 }
|
|
2282 }
|
|
2283 $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count
|
|
2284 $cigar_2 = $new_cigar_2;
|
3
|
2285 # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n";
|
0
|
2286 }
|
|
2287
|
3
|
2288 ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on
|
|
2289
|
0
|
2290 if ($strand eq '+') {
|
|
2291 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2
|
|
2292 @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
2293 # print "reverse: @comp_cigar_2\n";
|
|
2294
|
|
2295 my $MD_count_1 = 0;
|
|
2296 foreach (@comp_cigar_1) {
|
|
2297 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
2298 }
|
|
2299
|
|
2300 my $MD_count_2 = 0;
|
|
2301 foreach (@comp_cigar_2) {
|
|
2302 ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
2303 }
|
|
2304
|
|
2305 $end_read_1 = $start_read_1 + $MD_count_1 - 1;
|
|
2306 $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand
|
3
|
2307 }
|
|
2308 else {
|
0
|
2309 ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1
|
|
2310
|
|
2311 @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
2312 # print "reverse: @comp_cigar_1\n";
|
|
2313
|
|
2314 my $MD_count_1 = 0;
|
|
2315 foreach (@comp_cigar_1) {
|
|
2316 ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
|
|
2317 }
|
|
2318
|
|
2319 $end_read_1 = $start_read_1;
|
|
2320 $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand
|
|
2321 }
|
|
2322
|
|
2323 if ($strand eq '+') {
|
3
|
2324 ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity
|
|
2325 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1);
|
0
|
2326
|
|
2327 # we next pass the second read which is in - orientation on the reverse strand
|
|
2328 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
|
3
|
2329 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2);
|
0
|
2330 } else {
|
|
2331 ## we first pass the first read which is in - orientation on the reverse strand
|
3
|
2332 print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1);
|
0
|
2333
|
|
2334 # we next pass the second read which is in + orientation on the forward strand
|
|
2335 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
|
3
|
2336 print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2);
|
0
|
2337 }
|
|
2338
|
|
2339 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
|
|
2340 }
|
|
2341 }
|
|
2342 }
|
|
2343 } else {
|
|
2344 die "Single-end or paired-end reads not specified properly\n";
|
|
2345 }
|
|
2346
|
3
|
2347 warn "\n\nProcessed $line_count lines from $filename in total\n";
|
|
2348 warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
0
|
2349 if ($report) {
|
3
|
2350 print REPORT "\n\nProcessed $line_count lines from $filename in total\n";
|
0
|
2351 print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
|
|
2352 }
|
|
2353 print_splitting_report ();
|
|
2354 }
|
|
2355
|
|
2356
|
|
2357
|
|
### Prints the final cytosine methylation summary.
###
### Reads the file-level %counting tallies (methylated and unmethylated calls in
### CpG, CHG and CHH context) as well as the $merge_non_CpG and $report settings.
### The summary is written to the REPORT filehandle first (only if $report is set)
### and then, always, to the currently selected output filehandle.
### Takes no arguments. Returns the status of the last print call.
sub print_splitting_report{

  ### Fetching the six tallies from %counting once
  my ($meth_CpG,$meth_CHG,$meth_CHH)       = @counting{qw(total_meCpG_count total_meCHG_count total_meCHH_count)};
  my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH) = @counting{qw(total_unmethylated_CpG_count total_unmethylated_CHG_count total_unmethylated_CHH_count)};

  ### Methylation percentage with one decimal place. The result stays undefined
  ### if there were no calls at all, which avoids a division by zero
  my $percentage = sub {
    my ($meth,$unmeth) = @_;
    my $calls = $meth + $unmeth;
    return unless ($calls > 0);
    return sprintf("%.1f",100*$meth/$calls);
  };

  ### One summary line per context. sprintf always returns a string such as '0.0',
  ### which is true, so only an undefined percentage ends up in the "Can't determine" branch
  my $context_line = sub {
    my ($context,$percent,$line_end) = @_;
    return $percent
      ? "C methylated in $context context:\t${percent}%$line_end"
      : "Can't determine percentage of methylated Cs in $context context if value was 0$line_end";
  };

  my $total_number_of_C = $meth_CHG + $meth_CHH + $meth_CpG + $unmeth_CHG + $unmeth_CHH + $unmeth_CpG;

  ### Every element holds the argument list of exactly one print call
  my @print_calls = (
		     [ "Final Cytosine Methylation Report\n", '=' x 33, "\n" ],
		     [ "Total number of C's analysed:\t$total_number_of_C\n\n" ],
		     [ "Total methylated C's in CpG context:\t$meth_CpG\n" ],
		     [ "Total methylated C's in CHG context:\t$meth_CHG\n" ],
		     [ "Total methylated C's in CHH context:\t$meth_CHH\n\n" ],
		     [ "Total C to T conversions in CpG context:\t$unmeth_CpG\n" ],
		     [ "Total C to T conversions in CHG context:\t$unmeth_CHG\n" ],
		     [ "Total C to T conversions in CHH context:\t$unmeth_CHH\n\n" ],
		     [ $context_line->('CpG',$percentage->($meth_CpG,$unmeth_CpG),"\n") ],
		    );

  ### 2-Context Output (CpG and non-CpG)
  if ($merge_non_CpG){
    my $percent_non_CpG_methylation = $percentage->($meth_CHH + $meth_CHG, $unmeth_CHH + $unmeth_CHG);
    push @print_calls, [ $context_line->('non-CpG',$percent_non_CpG_methylation,"\n\n\n") ];
  }
  ### 3-Context Output (CpG, CHG and CHH)
  else{
    push @print_calls, [ $context_line->('CHG',$percentage->($meth_CHG,$unmeth_CHG),"\n") ];
    push @print_calls, [ $context_line->('CHH',$percentage->($meth_CHH,$unmeth_CHH),"\n\n\n") ];
  }

  ### Destinations in output order: the REPORT file (optional), then the on-screen report.
  ### Each print call is replayed one by one with its own argument list for every destination
  my @destinations;
  push @destinations, sub { print REPORT @_ } if ($report);
  push @destinations, sub { print @_ };

  my $print_status;
  foreach my $write_to (@destinations){
    foreach my $arguments (@print_calls){
      $print_status = $write_to->(@{$arguments});
    }
  }

  return $print_status;
}
|
|
2488
|
|
2489
|
|
2490
|
|
2491 sub print_individual_C_methylation_states_paired_end_files{
|
|
2492
|
3
|
2493 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_;
|
|
2494
|
|
2495 ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2
|
|
2496 die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2);
|
|
2497
|
0
|
2498 my @methylation_calls = split(//,$meth_call);
|
|
2499
|
|
2500 #################################################################
|
|
2501 ### . for bases not involving cytosines ###
|
|
2502 ### X for methylated C in CHG context (was protected) ###
|
|
2503 ### x for not methylated C in CHG context (was converted) ###
|
|
2504 ### H for methylated C in CHH context (was protected) ###
|
|
2505 ### h for not methylated C in CHH context (was converted) ###
|
|
2506 ### Z for methylated C in CpG context (was protected) ###
|
|
2507 ### z for not methylated C in CpG context (was converted) ###
|
3
|
2508 ### U for methylated C in Unknown context (was protected) ###
|
|
2509 ### u for not methylated C in Unknown context (was converted) ###
|
0
|
2510 #################################################################
|
|
2511
|
|
2512 my $methyl_CHG_count = 0;
|
|
2513 my $methyl_CHH_count = 0;
|
|
2514 my $methyl_CpG_count = 0;
|
|
2515 my $unmethylated_CHG_count = 0;
|
|
2516 my $unmethylated_CHH_count = 0;
|
|
2517 my $unmethylated_CpG_count = 0;
|
|
2518
|
|
2519 my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions
|
|
2520 my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels
|
|
2521 my @comp_cigar;
|
|
2522
|
|
2523 ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing
|
|
2524 if ($cigar =~ /^\d+M$/){
|
3
|
2525 # this check speeds up the extraction process by up to 60%!!!
|
0
|
2526 }
|
|
2527 else{ # parsing CIGAR string
|
|
2528 my @len;
|
|
2529 my @ops;
|
|
2530 @len = split (/\D+/,$cigar); # storing the length per operation
|
|
2531 @ops = split (/\d+/,$cigar); # storing the operation
|
|
2532 shift @ops; # remove the empty first element
|
3
|
2533
|
0
|
2534 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
2535
|
|
2536 foreach my $index (0..$#len){
|
|
2537 foreach (1..$len[$index]){
|
|
2538 # print "$ops[$index]";
|
|
2539 push @comp_cigar, $ops[$index];
|
|
2540 }
|
|
2541 }
|
|
2542 # warn "\nDetected CIGAR string: $cigar\n";
|
|
2543 # warn "Length of methylation call: ",length $meth_call,"\n";
|
|
2544 # warn "number of operations: ",scalar @ops,"\n";
|
|
2545 # warn "number of length digits: ",scalar @len,"\n\n";
|
|
2546 # print @comp_cigar,"\n";
|
|
2547 # print "$meth_call\n\n";
|
|
2548 # sleep (1);
|
|
2549 }
|
|
2550
|
|
2551 if ($strand eq '-') {
|
|
2552
|
|
2553 ### the CIGAR string needs to be reversed, the methylation call has already been reversed above
|
|
2554 if (@comp_cigar){
|
|
2555 @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too
|
|
2556 }
|
|
2557 # print "reverse CIGAR string: @comp_cigar\n";
|
|
2558
|
|
2559 ### the start position of paired-end files has already been corrected, see above
|
|
2560 }
|
|
2561
|
|
2562 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified
|
|
2563
|
|
2564 if ($merge_non_CpG) {
|
3
|
2565 if ($no_overlap) { # this has to be read 2...
|
0
|
2566
|
|
2567 ### single-file CpG and non-CpG context output
|
|
2568 if ($full) {
|
|
2569 if ($strand eq '+') {
|
|
2570 for my $index (0..$#methylation_calls) {
|
|
2571
|
|
2572 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2573 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2574 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2575 $cigar_offset += $cigar_mod;
|
|
2576 $pos_offset += $pos_mod;
|
|
2577 }
|
|
2578
|
|
2579 ### Returning as soon as the methylation calls start overlapping
|
|
2580 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2581 return;
|
|
2582 }
|
|
2583
|
|
2584 if ($methylation_calls[$index] eq 'X') {
|
|
2585 $counting{total_meCHG_count}++;
|
3
|
2586 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2587 if ($read_identity == 1){
|
|
2588 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
2589 }
|
|
2590 else{
|
|
2591 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
2592 }
|
|
2593 }
|
|
2594 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
2595 $counting{total_unmethylated_CHG_count}++;
|
3
|
2596 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2597 if ($read_identity == 1){
|
|
2598 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
2599 }
|
|
2600 else{
|
|
2601 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
2602 }
|
|
2603 }
|
|
2604 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
2605 $counting{total_meCpG_count}++;
|
3
|
2606 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2607 if ($read_identity == 1){
|
|
2608 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
2609 }
|
|
2610 else{
|
|
2611 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
2612 }
|
|
2613 }
|
|
2614 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
2615 $counting{total_unmethylated_CpG_count}++;
|
3
|
2616 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2617 if ($read_identity == 1){
|
|
2618 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
2619 }
|
|
2620 else{
|
|
2621 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
2622 }
|
|
2623 }
|
|
2624 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
2625 $counting{total_meCHH_count}++;
|
3
|
2626 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2627 if ($read_identity == 1){
|
|
2628 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
2629 }
|
|
2630 else{
|
|
2631 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
2632 }
|
|
2633 }
|
|
2634 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
2635 $counting{total_unmethylated_CHH_count}++;
|
3
|
2636 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2637 if ($read_identity == 1){
|
|
2638 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
2639 }
|
|
2640 else{
|
|
2641 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
2642 }
|
0
|
2643 }
|
|
2644 elsif ($methylation_calls[$index] eq '.'){}
|
3
|
2645 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
2646 else{
|
3
|
2647 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
|
0
|
2648 }
|
|
2649 }
|
3
|
2650 }
|
|
2651 elsif ($strand eq '-') {
|
0
|
2652 for my $index (0..$#methylation_calls) {
|
|
2653
|
|
2654 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2655 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2656 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2657 $cigar_offset += $cigar_mod;
|
|
2658 $pos_offset += $pos_mod;
|
|
2659 }
|
|
2660
|
|
2661 ### Returning as soon as the methylation calls start overlapping
|
|
2662 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2663 return;
|
|
2664 }
|
3
|
2665
|
0
|
2666 if ($methylation_calls[$index] eq 'X') {
|
|
2667 $counting{total_meCHG_count}++;
|
3
|
2668 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2669 if ($read_identity == 1){
|
|
2670 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
2671 }
|
|
2672 else{
|
|
2673 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
2674 }
|
|
2675 }
|
|
2676 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
2677 $counting{total_unmethylated_CHG_count}++;
|
3
|
2678 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2679 if ($read_identity == 1){
|
|
2680 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
2681 }
|
|
2682 else{
|
|
2683 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
2684 }
|
|
2685 }
|
|
2686 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
2687 $counting{total_meCpG_count}++;
|
3
|
2688 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2689 if ($read_identity == 1){
|
|
2690 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
2691 }
|
|
2692 else{
|
|
2693 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
2694 }
|
|
2695 }
|
|
2696 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
2697 $counting{total_unmethylated_CpG_count}++;
|
3
|
2698 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2699 if ($read_identity == 1){
|
|
2700 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
2701 }
|
|
2702 else{
|
|
2703 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
2704 }
|
|
2705 }
|
|
2706 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
2707 $counting{total_meCHH_count}++;
|
3
|
2708 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2709 if ($read_identity == 1){
|
|
2710 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
2711 }
|
|
2712 else{
|
|
2713 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
2714 }
|
|
2715 }
|
|
2716 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
2717 $counting{total_unmethylated_CHH_count}++;
|
3
|
2718 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2719 if ($read_identity == 1){
|
|
2720 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
2721 }
|
|
2722 else{
|
|
2723 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
2724 }
|
0
|
2725 }
|
|
2726 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
2727 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
2728 else{
|
3
|
2729 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
|
0
|
2730 }
|
|
2731 }
|
|
2732 } else {
|
|
2733 die "The read orientation was neither + nor -: '$strand'\n";
|
|
2734 }
|
|
2735 }
|
|
2736
|
|
2737 ### strand-specific methylation output
|
|
2738 else {
|
|
2739 if ($strand eq '+') {
|
|
2740 for my $index (0..$#methylation_calls) {
|
|
2741
|
|
2742 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2743 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2744 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2745 $cigar_offset += $cigar_mod;
|
|
2746 $pos_offset += $pos_mod;
|
|
2747 }
|
|
2748
|
|
2749 ### Returning as soon as the methylation calls start overlapping
|
|
2750 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
2751 return;
|
|
2752 }
|
|
2753
|
|
2754 if ($methylation_calls[$index] eq 'X') {
|
|
2755 $counting{total_meCHG_count}++;
|
3
|
2756 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2757 if ($read_identity == 1){
|
|
2758 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
2759 }
|
|
2760 else{
|
|
2761 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
2762 }
|
|
2763 }
|
|
2764 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
2765 $counting{total_unmethylated_CHG_count}++;
|
3
|
2766 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2767 if ($read_identity == 1){
|
|
2768 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
2769 }
|
|
2770 else{
|
|
2771 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
2772 }
|
|
2773 }
|
|
2774 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
2775 $counting{total_meCpG_count}++;
|
3
|
2776 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2777 if ($read_identity == 1){
|
|
2778 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
2779 }
|
|
2780 else{
|
|
2781 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
2782 }
|
|
2783 }
|
|
2784 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
2785 $counting{total_unmethylated_CpG_count}++;
|
3
|
2786 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2787 if ($read_identity == 1){
|
|
2788 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
2789 }
|
|
2790 else{
|
|
2791 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
2792 }
|
|
2793 }
|
|
2794 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
2795 $counting{total_meCHH_count}++;
|
3
|
2796 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2797 if ($read_identity == 1){
|
|
2798 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
2799 }
|
|
2800 else{
|
|
2801 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
2802 }
|
|
2803 }
|
|
2804 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
2805 $counting{total_unmethylated_CHH_count}++;
|
3
|
2806 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2807 if ($read_identity == 1){
|
|
2808 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
2809 }
|
|
2810 else{
|
|
2811 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
2812 }
|
0
|
2813 }
|
|
2814 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
2815 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
2816 else{
|
|
2817 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2818 }
|
|
2819 }
|
|
2820 } elsif ($strand eq '-') {
|
|
2821 for my $index (0..$#methylation_calls) {
|
|
2822
|
|
2823 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2824 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2825 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2826 $cigar_offset += $cigar_mod;
|
|
2827 $pos_offset += $pos_mod;
|
|
2828 }
|
|
2829
|
|
2830 ### Returning as soon as the methylation calls start overlapping
|
|
2831 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
2832 return;
|
|
2833 }
|
|
2834
|
|
2835 if ($methylation_calls[$index] eq 'X') {
|
|
2836 $counting{total_meCHG_count}++;
|
3
|
2837 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2838 if ($read_identity == 1){
|
|
2839 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
2840 }
|
|
2841 else{
|
|
2842 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
2843 }
|
|
2844 }
|
|
2845 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
2846 $counting{total_unmethylated_CHG_count}++;
|
3
|
2847 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2848 if ($read_identity == 1){
|
|
2849 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
2850 }
|
|
2851 else{
|
|
2852 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
2853 }
|
|
2854 }
|
|
2855 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
2856 $counting{total_meCpG_count}++;
|
3
|
2857 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2858 if ($read_identity == 1){
|
|
2859 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
2860 }
|
|
2861 else{
|
|
2862 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
2863 }
|
|
2864 }
|
|
2865 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
2866 $counting{total_unmethylated_CpG_count}++;
|
3
|
2867 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2868 if ($read_identity == 1){
|
|
2869 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
2870 }
|
|
2871 else{
|
|
2872 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
2873 }
|
|
2874 }
|
|
2875 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
2876 $counting{total_meCHH_count}++;
|
3
|
2877 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2878 if ($read_identity == 1){
|
|
2879 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
2880 }
|
|
2881 else{
|
|
2882 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
2883 }
|
|
2884 }
|
|
2885 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
2886 $counting{total_unmethylated_CHH_count}++;
|
3
|
2887 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2888 if ($read_identity == 1){
|
|
2889 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
2890 }
|
|
2891 else{
|
|
2892 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
2893 }
|
0
|
2894 }
|
|
2895 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
2896 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
2897 else{
|
|
2898 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
2899 }
|
|
2900 }
|
|
2901 } else {
|
|
2902 die "The strand orientation was neither + nor -: '$strand'/n";
|
|
2903 }
|
|
2904 }
|
|
2905 }
|
|
2906
|
|
2907 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
2908 ### Still within the 2-CONTEXT ONLY optional section
|
|
2909 else {
|
|
2910 ### single-file CpG and non-CpG context output
|
|
2911 if ($full) {
|
|
2912 if ($strand eq '+') {
|
|
2913 for my $index (0..$#methylation_calls) {
|
|
2914
|
|
2915 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2916 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2917 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
2918 $cigar_offset += $cigar_mod;
|
|
2919 $pos_offset += $pos_mod;
|
|
2920 }
|
|
2921
|
|
2922 if ($methylation_calls[$index] eq 'X') {
|
|
2923 $counting{total_meCHG_count}++;
|
3
|
2924 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2925 if ($read_identity == 1){
|
|
2926 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
2927 }
|
|
2928 else{
|
|
2929 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
2930 }
|
|
2931 }
|
|
2932 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
2933 $counting{total_unmethylated_CHG_count}++;
|
3
|
2934 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2935 if ($read_identity == 1){
|
|
2936 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
2937 }
|
|
2938 else{
|
|
2939 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
2940 }
|
|
2941 }
|
|
2942 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
2943 $counting{total_meCpG_count}++;
|
3
|
2944 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2945 if ($read_identity == 1){
|
|
2946 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
2947 }
|
|
2948 else{
|
|
2949 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
2950 }
|
|
2951 }
|
|
2952 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
2953 $counting{total_unmethylated_CpG_count}++;
|
3
|
2954 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2955 if ($read_identity == 1){
|
|
2956 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
2957 }
|
|
2958 else{
|
|
2959 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
2960 }
|
|
2961 }
|
|
2962 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
2963 $counting{total_meCHH_count}++;
|
3
|
2964 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2965 if ($read_identity == 1){
|
|
2966 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
2967 }
|
|
2968 else{
|
|
2969 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
2970 }
|
|
2971 }
|
|
2972 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
2973 $counting{total_unmethylated_CHH_count}++;
|
3
|
2974 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
2975 if ($read_identity == 1){
|
|
2976 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
2977 }
|
|
2978 else{
|
|
2979 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
2980 }
|
0
|
2981 }
|
|
2982 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
2983 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
2984 else{
|
3
|
2985 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
|
0
|
2986 }
|
|
2987 }
|
|
2988 } elsif ($strand eq '-') {
|
|
2989 for my $index (0..$#methylation_calls) {
|
|
2990
|
|
2991 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
2992 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
2993 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
2994 $cigar_offset += $cigar_mod;
|
|
2995 $pos_offset += $pos_mod;
|
|
2996 }
|
|
2997
|
|
2998 if ($methylation_calls[$index] eq 'X') {
|
|
2999 $counting{total_meCHG_count}++;
|
3
|
3000 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3001 if ($read_identity == 1){
|
|
3002 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3003 }
|
|
3004 else{
|
|
3005 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3006 }
|
|
3007 }
|
|
3008 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3009 $counting{total_unmethylated_CHG_count}++;
|
3
|
3010 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3011 if ($read_identity == 1){
|
|
3012 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3013 }
|
|
3014 else{
|
|
3015 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3016 }
|
|
3017 }
|
|
3018 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3019 $counting{total_meCpG_count}++;
|
3
|
3020 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3021 if ($read_identity == 1){
|
|
3022 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3023 }
|
|
3024 else{
|
|
3025 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3026 }
|
|
3027 }
|
|
3028 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3029 $counting{total_unmethylated_CpG_count}++;
|
3
|
3030 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3031 if ($read_identity == 1){
|
|
3032 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3033 }
|
|
3034 else{
|
|
3035 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3036 }
|
|
3037 }
|
|
3038 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3039 $counting{total_meCHH_count}++;
|
3
|
3040 print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3041 if ($read_identity == 1){
|
|
3042 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3043 }
|
|
3044 else{
|
|
3045 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3046 }
|
|
3047 }
|
|
3048 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3049 $counting{total_unmethylated_CHH_count}++;
|
3
|
3050 print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3051 if ($read_identity == 1){
|
|
3052 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3053 }
|
|
3054 else{
|
|
3055 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3056 }
|
0
|
3057 }
|
|
3058 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3059 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3060 else{
|
3
|
3061 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only);
|
0
|
3062 }
|
|
3063 }
|
|
3064 } else {
|
|
3065 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3066 }
|
|
3067 }
|
|
3068
|
|
3069 ### strand-specific methylation output
|
|
3070 ### still within the 2-CONTEXT optional section
|
|
3071 else {
|
|
3072 if ($strand eq '+') {
|
|
3073 for my $index (0..$#methylation_calls) {
|
|
3074
|
|
3075 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3076 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3077 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
3078 $cigar_offset += $cigar_mod;
|
|
3079 $pos_offset += $pos_mod;
|
|
3080 }
|
|
3081
|
|
3082 if ($methylation_calls[$index] eq 'X') {
|
|
3083 $counting{total_meCHG_count}++;
|
3
|
3084 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3085 if ($read_identity == 1){
|
|
3086 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3087 }
|
|
3088 else{
|
|
3089 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3090 }
|
|
3091 }
|
|
3092 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3093 $counting{total_unmethylated_CHG_count}++;
|
3
|
3094 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3095 if ($read_identity == 1){
|
|
3096 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3097 }
|
|
3098 else{
|
|
3099 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3100 }
|
|
3101 }
|
|
3102 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3103 $counting{total_meCpG_count}++;
|
3
|
3104 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3105 if ($read_identity == 1){
|
|
3106 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3107 }
|
|
3108 else{
|
|
3109 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3110 }
|
|
3111 }
|
|
3112 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3113 $counting{total_unmethylated_CpG_count}++;
|
3
|
3114 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3115 if ($read_identity == 1){
|
|
3116 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3117 }
|
|
3118 else{
|
|
3119 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3120 }
|
|
3121 }
|
|
3122 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3123 $counting{total_meCHH_count}++;
|
3
|
3124 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3125 if ($read_identity == 1){
|
|
3126 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3127 }
|
|
3128 else{
|
|
3129 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3130 }
|
|
3131 }
|
|
3132 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3133 $counting{total_unmethylated_CHH_count}++;
|
3
|
3134 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3135 if ($read_identity == 1){
|
|
3136 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3137 }
|
|
3138 else{
|
|
3139 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3140 }
|
0
|
3141 }
|
|
3142 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3143 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3144 else{
|
|
3145 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3146 }
|
|
3147 }
|
|
3148 } elsif ($strand eq '-') {
|
|
3149 for my $index (0..$#methylation_calls) {
|
|
3150
|
|
3151 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3152 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
3153 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3154 $cigar_offset += $cigar_mod;
|
|
3155 $pos_offset += $pos_mod;
|
|
3156 }
|
|
3157
|
|
3158 if ($methylation_calls[$index] eq 'X') {
|
|
3159 $counting{total_meCHG_count}++;
|
3
|
3160 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3161 if ($read_identity == 1){
|
|
3162 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3163 }
|
|
3164 else{
|
|
3165 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3166 }
|
|
3167 }
|
|
3168 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3169 $counting{total_unmethylated_CHG_count}++;
|
3
|
3170 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3171 if ($read_identity == 1){
|
|
3172 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3173 }
|
|
3174 else{
|
|
3175 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3176 }
|
|
3177 }
|
|
3178 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3179 $counting{total_meCpG_count}++;
|
3
|
3180 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3181 if ($read_identity == 1){
|
|
3182 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3183 }
|
|
3184 else{
|
|
3185 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3186 }
|
|
3187 }
|
|
3188 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3189 $counting{total_unmethylated_CpG_count}++;
|
3
|
3190 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3191 if ($read_identity == 1){
|
|
3192 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3193 }
|
|
3194 else{
|
|
3195 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3196 }
|
|
3197 }
|
|
3198 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3199 $counting{total_meCHH_count}++;
|
3
|
3200 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3201 if ($read_identity == 1){
|
|
3202 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3203 }
|
|
3204 else{
|
|
3205 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3206 }
|
|
3207 }
|
|
3208 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3209 $counting{total_unmethylated_CHH_count}++;
|
3
|
3210 print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3211 if ($read_identity == 1){
|
|
3212 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3213 }
|
|
3214 else{
|
|
3215 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3216 }
|
0
|
3217 }
|
|
3218 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3219 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3220 else{
|
|
3221 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3222 }
|
|
3223 }
|
|
3224 } else {
|
|
3225 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3226 }
|
|
3227 }
|
|
3228 }
|
|
3229 }
|
|
3230
|
|
3231 ############################################
|
|
3232 ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ###
|
|
3233 ############################################
|
|
3234
|
|
3235 elsif ($no_overlap) {
|
|
3236 ### single-file CpG, CHG and CHH context output
|
|
3237 if ($full) {
|
|
3238 if ($strand eq '+') {
|
|
3239 for my $index (0..$#methylation_calls) {
|
|
3240
|
|
3241 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3242 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3243 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
3244 $cigar_offset += $cigar_mod;
|
|
3245 $pos_offset += $pos_mod;
|
|
3246 }
|
|
3247
|
|
3248 ### Returning as soon as the methylation calls start overlapping
|
|
3249 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
3250 return;
|
|
3251 }
|
|
3252
|
|
3253 if ($methylation_calls[$index] eq 'X') {
|
|
3254 $counting{total_meCHG_count}++;
|
3
|
3255 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3256 if ($read_identity == 1){
|
|
3257 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3258 }
|
|
3259 else{
|
|
3260 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3261 }
|
|
3262 }
|
|
3263 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3264 $counting{total_unmethylated_CHG_count}++;
|
3
|
3265 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3266 if ($read_identity == 1){
|
|
3267 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3268 }
|
|
3269 else{
|
|
3270 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3271 }
|
|
3272 }
|
|
3273 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3274 $counting{total_meCpG_count}++;
|
3
|
3275 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3276 if ($read_identity == 1){
|
|
3277 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3278 }
|
|
3279 else{
|
|
3280 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3281 }
|
|
3282 }
|
|
3283 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3284 $counting{total_unmethylated_CpG_count}++;
|
3
|
3285 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3286 if ($read_identity == 1){
|
|
3287 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3288 }
|
|
3289 else{
|
|
3290 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3291 }
|
|
3292 }
|
|
3293 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3294 $counting{total_meCHH_count}++;
|
3
|
3295 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3296 if ($read_identity == 1){
|
|
3297 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3298 }
|
|
3299 else{
|
|
3300 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3301 }
|
|
3302 }
|
|
3303 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3304 $counting{total_unmethylated_CHH_count}++;
|
3
|
3305 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3306 if ($read_identity == 1){
|
|
3307 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3308 }
|
|
3309 else{
|
|
3310 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3311 }
|
0
|
3312 }
|
|
3313 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3314 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3315 else{
|
|
3316 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3317 }
|
|
3318 }
|
|
3319 } elsif ($strand eq '-') {
|
|
3320 for my $index (0..$#methylation_calls) {
|
|
3321
|
|
3322 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3323 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
3324 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3325 $cigar_offset += $cigar_mod;
|
|
3326 $pos_offset += $pos_mod;
|
|
3327 }
|
|
3328
|
|
3329 ### Returning as soon as the methylation calls start overlapping
|
|
3330 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
3331 return;
|
|
3332 }
|
|
3333
|
|
3334 if ($methylation_calls[$index] eq 'X') {
|
|
3335 $counting{total_meCHG_count}++;
|
3
|
3336 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3337 if ($read_identity == 1){
|
|
3338 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3339 }
|
|
3340 else{
|
|
3341 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3342 }
|
|
3343 }
|
|
3344 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3345 $counting{total_unmethylated_CHG_count}++;
|
3
|
3346 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3347 if ($read_identity == 1){
|
|
3348 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3349 }
|
|
3350 else{
|
|
3351 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3352 }
|
|
3353 }
|
|
3354 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3355 $counting{total_meCpG_count}++;
|
3
|
3356 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3357 if ($read_identity == 1){
|
|
3358 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3359 }
|
|
3360 else{
|
|
3361 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3362 }
|
|
3363 }
|
|
3364 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3365 $counting{total_unmethylated_CpG_count}++;
|
3
|
3366 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3367 if ($read_identity == 1){
|
|
3368 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3369 }
|
|
3370 else{
|
|
3371 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3372 }
|
|
3373 }
|
|
3374 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3375 $counting{total_meCHH_count}++;
|
3
|
3376 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3377 if ($read_identity == 1){
|
|
3378 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3379 }
|
|
3380 else{
|
|
3381 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3382 }
|
|
3383 }
|
|
3384 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3385 $counting{total_unmethylated_CHH_count}++;
|
3
|
3386 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3387 if ($read_identity == 1){
|
|
3388 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3389 }
|
|
3390 else{
|
|
3391 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3392 }
|
0
|
3393 }
|
|
3394 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3395 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3396 else{
|
|
3397 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3398 }
|
|
3399 }
|
|
3400 } else {
|
|
3401 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3402 }
|
|
3403 }
|
|
3404
|
|
3405 ### strand-specific methylation output
|
|
3406 else {
|
|
3407 if ($strand eq '+') {
|
|
3408 for my $index (0..$#methylation_calls) {
|
|
3409
|
|
3410 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3411 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3412 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
3413 $cigar_offset += $cigar_mod;
|
|
3414 $pos_offset += $pos_mod;
|
|
3415 }
|
|
3416
|
|
3417 ### Returning as soon as the methylation calls start overlapping
|
|
3418 if ($start+$index+$pos_offset >= $end_read_1) {
|
|
3419 return;
|
|
3420 }
|
|
3421
|
|
3422 if ($methylation_calls[$index] eq 'X') {
|
|
3423 $counting{total_meCHG_count}++;
|
3
|
3424 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3425 if ($read_identity == 1){
|
|
3426 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3427 }
|
|
3428 else{
|
|
3429 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3430 }
|
|
3431 }
|
|
3432 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3433 $counting{total_unmethylated_CHG_count}++;
|
3
|
3434 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3435 if ($read_identity == 1){
|
|
3436 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3437 }
|
|
3438 else{
|
|
3439 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3440 }
|
|
3441 }
|
|
3442 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3443 $counting{total_meCpG_count}++;
|
3
|
3444 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3445 if ($read_identity == 1){
|
|
3446 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3447 }
|
|
3448 else{
|
|
3449 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3450 }
|
|
3451 }
|
|
3452 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3453 $counting{total_unmethylated_CpG_count}++;
|
3
|
3454 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3455 if ($read_identity == 1){
|
|
3456 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3457 }
|
|
3458 else{
|
|
3459 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3460 }
|
|
3461 }
|
|
3462 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3463 $counting{total_meCHH_count}++;
|
3
|
3464 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3465 if ($read_identity == 1){
|
|
3466 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3467 }
|
|
3468 else{
|
|
3469 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3470 }
|
|
3471 }
|
|
3472 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3473 $counting{total_unmethylated_CHH_count}++;
|
3
|
3474 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3475 if ($read_identity == 1){
|
|
3476 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3477 }
|
|
3478 else{
|
|
3479 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3480 }
|
0
|
3481 }
|
|
3482 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3483 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3484 else{
|
|
3485 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3486 }
|
|
3487 }
|
|
3488 } elsif ($strand eq '-') {
|
|
3489 for my $index (0..$#methylation_calls) {
|
|
3490
|
|
3491 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3492 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
3493 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3494 $cigar_offset += $cigar_mod;
|
|
3495 $pos_offset += $pos_mod;
|
|
3496 }
|
|
3497
|
|
3498 ### Returning as soon as the methylation calls start overlapping
|
|
3499 if ($start-$index+$pos_offset <= $end_read_1) {
|
|
3500 return;
|
|
3501 }
|
|
3502
|
|
3503 if ($methylation_calls[$index] eq 'X') {
|
|
3504 $counting{total_meCHG_count}++;
|
3
|
3505 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3506 if ($read_identity == 1){
|
|
3507 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3508 }
|
|
3509 else{
|
|
3510 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3511 }
|
|
3512 }
|
|
3513 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3514 $counting{total_unmethylated_CHG_count}++;
|
3
|
3515 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3516 if ($read_identity == 1){
|
|
3517 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3518 }
|
|
3519 else{
|
|
3520 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3521 }
|
|
3522 }
|
|
3523 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3524 $counting{total_meCpG_count}++;
|
3
|
3525 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3526 if ($read_identity == 1){
|
|
3527 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3528 }
|
|
3529 else{
|
|
3530 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3531 }
|
|
3532 }
|
|
3533 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3534 $counting{total_unmethylated_CpG_count}++;
|
3
|
3535 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3536 if ($read_identity == 1){
|
|
3537 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3538 }
|
|
3539 else{
|
|
3540 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3541 }
|
|
3542 }
|
|
3543 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3544 $counting{total_meCHH_count}++;
|
3
|
3545 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3546 if ($read_identity == 1){
|
|
3547 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3548 }
|
|
3549 else{
|
|
3550 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3551 }
|
|
3552 }
|
|
3553 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3554 $counting{total_unmethylated_CHH_count}++;
|
3
|
3555 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3556 if ($read_identity == 1){
|
|
3557 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3558 }
|
|
3559 else{
|
|
3560 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3561 }
|
0
|
3562 }
|
|
3563 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3564 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3565 else{
|
|
3566 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3567 }
|
|
3568 }
|
|
3569 } else {
|
|
3570 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3571 }
|
|
3572 }
|
|
3573 }
|
|
3574
|
|
3575 ### this is the default paired-end procedure allowing overlaps and using every single C position
|
|
3576 else {
|
|
3577 ### single-file CpG, CHG and CHH context output
|
|
3578 if ($full) {
|
|
3579 if ($strand eq '+') {
|
|
3580 for my $index (0..$#methylation_calls) {
|
|
3581
|
|
3582 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3583 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3584 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
3585 $cigar_offset += $cigar_mod;
|
|
3586 $pos_offset += $pos_mod;
|
|
3587 }
|
|
3588
|
|
3589 if ($methylation_calls[$index] eq 'X') {
|
|
3590 $counting{total_meCHG_count}++;
|
3
|
3591 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3592 if ($read_identity == 1){
|
|
3593 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3594 }
|
|
3595 else{
|
|
3596 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3597 }
|
|
3598 }
|
|
3599 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3600 $counting{total_unmethylated_CHG_count}++;
|
3
|
3601 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3602 if ($read_identity == 1){
|
|
3603 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3604 }
|
|
3605 else{
|
|
3606 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3607 }
|
|
3608 }
|
|
3609 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3610 $counting{total_meCpG_count}++;
|
3
|
3611 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3612 if ($read_identity == 1){
|
|
3613 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3614 }
|
|
3615 else{
|
|
3616 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3617 }
|
|
3618 }
|
|
3619 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3620 $counting{total_unmethylated_CpG_count}++;
|
3
|
3621 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3622 if ($read_identity == 1){
|
|
3623 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3624 }
|
|
3625 else{
|
|
3626 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3627 }
|
|
3628 }
|
|
3629 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3630 $counting{total_meCHH_count}++;
|
3
|
3631 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3632 if ($read_identity == 1){
|
|
3633 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3634 }
|
|
3635 else{
|
|
3636 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3637 }
|
|
3638 }
|
|
3639 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3640 $counting{total_unmethylated_CHH_count}++;
|
3
|
3641 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3642 if ($read_identity == 1){
|
|
3643 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3644 }
|
|
3645 else{
|
|
3646 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3647 }
|
0
|
3648 }
|
|
3649 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3650 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3651 else{
|
|
3652 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3653 }
|
|
3654 }
|
|
3655 } elsif ($strand eq '-') {
|
|
3656 for my $index (0..$#methylation_calls) {
|
|
3657
|
|
3658 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3659 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
3660 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3661 $cigar_offset += $cigar_mod;
|
|
3662 $pos_offset += $pos_mod;
|
|
3663 }
|
|
3664
|
|
3665 if ($methylation_calls[$index] eq 'X') {
|
|
3666 $counting{total_meCHG_count}++;
|
3
|
3667 print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3668 if ($read_identity == 1){
|
|
3669 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3670 }
|
|
3671 else{
|
|
3672 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3673 }
|
|
3674 }
|
|
3675 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3676 $counting{total_unmethylated_CHG_count}++;
|
3
|
3677 print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3678 if ($read_identity == 1){
|
|
3679 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3680 }
|
|
3681 else{
|
|
3682 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3683 }
|
|
3684 }
|
|
3685 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3686 $counting{total_meCpG_count}++;
|
3
|
3687 print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3688 if ($read_identity == 1){
|
|
3689 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3690 }
|
|
3691 else{
|
|
3692 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3693 }
|
|
3694 }
|
|
3695 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3696 $counting{total_unmethylated_CpG_count}++;
|
3
|
3697 print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3698 if ($read_identity == 1){
|
|
3699 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3700 }
|
|
3701 else{
|
|
3702 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3703 }
|
|
3704 }
|
|
3705 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3706 $counting{total_meCHH_count}++;
|
3
|
3707 print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3708 if ($read_identity == 1){
|
|
3709 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3710 }
|
|
3711 else{
|
|
3712 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3713 }
|
|
3714 }
|
|
3715 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3716 $counting{total_unmethylated_CHH_count}++;
|
3
|
3717 print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3718 if ($read_identity == 1){
|
|
3719 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3720 }
|
|
3721 else{
|
|
3722 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3723 }
|
0
|
3724 }
|
|
3725 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3726 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3727 else{
|
|
3728 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3729 }
|
|
3730 }
|
|
3731 } else {
|
|
3732 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3733 }
|
|
3734 }
|
|
3735
|
|
3736 ### strand-specific methylation output
|
|
3737 else {
|
|
3738 if ($strand eq '+') {
|
|
3739 for my $index (0..$#methylation_calls) {
|
|
3740
|
|
3741 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3742 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3743 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t";
|
|
3744 $cigar_offset += $cigar_mod;
|
|
3745 $pos_offset += $pos_mod;
|
|
3746 }
|
|
3747
|
|
3748 if ($methylation_calls[$index] eq 'X') {
|
|
3749 $counting{total_meCHG_count}++;
|
3
|
3750 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3751 if ($read_identity == 1){
|
|
3752 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3753 }
|
|
3754 else{
|
|
3755 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3756 }
|
|
3757 }
|
|
3758 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3759 $counting{total_unmethylated_CHG_count}++;
|
3
|
3760 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3761 if ($read_identity == 1){
|
|
3762 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3763 }
|
|
3764 else{
|
|
3765 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3766 }
|
|
3767 }
|
|
3768 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3769 $counting{total_meCpG_count}++;
|
3
|
3770 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3771 if ($read_identity == 1){
|
|
3772 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3773 }
|
|
3774 else{
|
|
3775 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3776 }
|
|
3777 }
|
|
3778 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3779 $counting{total_unmethylated_CpG_count}++;
|
3
|
3780 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3781 if ($read_identity == 1){
|
|
3782 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3783 }
|
|
3784 else{
|
|
3785 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3786 }
|
|
3787 }
|
|
3788 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3789 $counting{total_meCHH_count}++;
|
3
|
3790 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3791 if ($read_identity == 1){
|
|
3792 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3793 }
|
|
3794 else{
|
|
3795 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3796 }
|
|
3797 }
|
|
3798 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3799 $counting{total_unmethylated_CHH_count}++;
|
3
|
3800 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3801 if ($read_identity == 1){
|
|
3802 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3803 }
|
|
3804 else{
|
|
3805 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3806 }
|
0
|
3807 }
|
|
3808 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3809 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3810 else{
|
|
3811 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3812 }
|
|
3813 }
|
|
3814 } elsif ($strand eq '-') {
|
|
3815 for my $index (0..$#methylation_calls) {
|
|
3816
|
|
3817 if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
|
|
3818 # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t";
|
|
3819 my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
|
|
3820 $cigar_offset += $cigar_mod;
|
|
3821 $pos_offset += $pos_mod;
|
|
3822 }
|
|
3823
|
|
3824 if ($methylation_calls[$index] eq 'X') {
|
|
3825 $counting{total_meCHG_count}++;
|
3
|
3826 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3827 if ($read_identity == 1){
|
|
3828 $mbias_1{CHG}->{$index+1}->{meth}++;
|
|
3829 }
|
|
3830 else{
|
|
3831 $mbias_2{CHG}->{$index+1}->{meth}++;
|
|
3832 }
|
|
3833 }
|
|
3834 elsif ($methylation_calls[$index] eq 'x') {
|
0
|
3835 $counting{total_unmethylated_CHG_count}++;
|
3
|
3836 print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3837 if ($read_identity == 1){
|
|
3838 $mbias_1{CHG}->{$index+1}->{un}++;
|
|
3839 }
|
|
3840 else{
|
|
3841 $mbias_2{CHG}->{$index+1}->{un}++;
|
|
3842 }
|
|
3843 }
|
|
3844 elsif ($methylation_calls[$index] eq 'Z') {
|
0
|
3845 $counting{total_meCpG_count}++;
|
3
|
3846 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3847 if ($read_identity == 1){
|
|
3848 $mbias_1{CpG}->{$index+1}->{meth}++;
|
|
3849 }
|
|
3850 else{
|
|
3851 $mbias_2{CpG}->{$index+1}->{meth}++;
|
|
3852 }
|
|
3853 }
|
|
3854 elsif ($methylation_calls[$index] eq 'z') {
|
0
|
3855 $counting{total_unmethylated_CpG_count}++;
|
3
|
3856 print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3857 if ($read_identity == 1){
|
|
3858 $mbias_1{CpG}->{$index+1}->{un}++;
|
|
3859 }
|
|
3860 else{
|
|
3861 $mbias_2{CpG}->{$index+1}->{un}++;
|
|
3862 }
|
|
3863 }
|
|
3864 elsif ($methylation_calls[$index] eq 'H') {
|
0
|
3865 $counting{total_meCHH_count}++;
|
3
|
3866 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3867 if ($read_identity == 1){
|
|
3868 $mbias_1{CHH}->{$index+1}->{meth}++;
|
|
3869 }
|
|
3870 else{
|
|
3871 $mbias_2{CHH}->{$index+1}->{meth}++;
|
|
3872 }
|
|
3873 }
|
|
3874 elsif ($methylation_calls[$index] eq 'h') {
|
0
|
3875 $counting{total_unmethylated_CHH_count}++;
|
3
|
3876 print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only);
|
|
3877 if ($read_identity == 1){
|
|
3878 $mbias_1{CHH}->{$index+1}->{un}++;
|
|
3879 }
|
|
3880 else{
|
|
3881 $mbias_2{CHH}->{$index+1}->{un}++;
|
|
3882 }
|
0
|
3883 }
|
|
3884 elsif ($methylation_calls[$index] eq '.') {}
|
3
|
3885 elsif (lc$methylation_calls[$index] eq 'u'){}
|
0
|
3886 else{
|
|
3887 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
|
|
3888 }
|
|
3889 }
|
|
3890 } else {
|
|
3891 die "The strand orientation as neither + nor -: '$strand'\n";
|
|
3892 }
|
|
3893 }
|
|
3894 }
|
|
3895 }
|
|
3896
|
|
# Works out how far the CIGAR index and the genomic position have to be
# shifted for a single read position of an alignment containing InDels.
#
# Arguments:
#   $index        - 0-based index of the current base in the methylation call
#   $cigar_offset - CIGAR index shift accumulated by the caller so far; it
#                   grows by one for every deleted ('D') position skipped
#   $pos_offset   - genomic position shift accumulated by the caller so far;
#                   accepted for interface compatibility, not used here
#   $strand       - '+' or '-'
#   $comp_cigar   - array reference holding one CIGAR operation ('M', 'I' or
#                   'D') per element; the visible callers reverse this array
#                   for '-' strand reads before calling
#
# Returns the list ($new_cigar_offset, $new_pos_offset): the increments the
# caller has to add to its running $cigar_offset and $pos_offset.  For any
# strand other than '+' or '-' the result is (0,0); callers validate the
# strand themselves.
#
# Dies if the composite CIGAR has no element for the requested position or if
# it contains an operation other than 'M', 'I' and 'D'.
sub check_cigar_string {
    my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_;

    # Both strands are handled by the same logic, only the sign of the
    # adjustment differs: on the '+' strand a deletion moves the genomic
    # position up by one and an insertion moves it down by one, on the '-'
    # strand it is the other way round.
    my $deletion_shift;
    if ($strand eq '+') {
        $deletion_shift = 1;
    }
    elsif ($strand eq '-') {
        $deletion_shift = -1;
    }
    else {
        return (0,0);
    }

    my ($new_cigar_offset,$new_pos_offset) = (0,0);
    my $cigar_index = $index + $cigar_offset;

    # Fail with a clear message instead of comparing an undefined element
    if ($cigar_index > $#{$comp_cigar}) {
        die "The CIGAR string does not contain an operation for position $cigar_index (methylation call index $index)\n";
    }

    my $operation = $comp_cigar->[$cigar_index];

    if ($operation eq 'M') {
        # sequence position matches the genomic position, no adjustment needed
    }
    elsif ($operation eq 'I') {
        # insertion in the read: the inserted base has no genomic counterpart
        $new_pos_offset -= $deletion_shift;
    }
    elsif ($operation eq 'D') {
        # Deletion in the read: the composite CIGAR no longer lines up with
        # the methylation call index.  Skip over the whole run of deleted
        # positions until an 'M' or 'I' is found (or the CIGAR ends).
        while ( ($cigar_index + $new_cigar_offset) <= $#{$comp_cigar} ) {
            my $next_operation = $comp_cigar->[$cigar_index + $new_cigar_offset];

            if ($next_operation eq 'D') {
                $new_cigar_offset += 1;
                $new_pos_offset   += $deletion_shift;
            }
            elsif ($next_operation eq 'M') {
                last;
            }
            elsif ($next_operation eq 'I') {
                # the base following the deletion(s) is itself an insertion
                $new_pos_offset -= $deletion_shift;
                last;
            }
            else {
                die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$next_operation'\n";
            }
        }
    }
    else {
        die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '$operation'\n";
    }

    return ($new_cigar_offset,$new_pos_offset);
}
|
|
3988
|
|
# Walks along the methylation call string of one single-end read and, for
# every cytosine call, updates the global counters (%counting), the M-bias
# table (%mbias_1, keyed by context and 1-based read position) and - unless
# $mbias_only is set - writes one tab-separated line
#   <read id> <+|-> <chromosome> <position> <call character>
# to the output file that belongs to the cytosine context.  Methylated calls
# are written with '+', unmethylated calls with '-'.
#
# Arguments:
#   $meth_call        - methylation call string, one character per read base
#   $chrom            - chromosome name (written to the output as is)
#   $start            - genomic start position of the alignment
#   $id               - read identifier (written to the output as is)
#   $strand           - '+' or '-'
#   $filehandle_index - key into %fhs; only used in the strand-specific
#                       output modes (i.e. when $full is not set)
#   $cigar            - CIGAR string; treated as SAM input when true, as the
#                       "vanilla" format otherwise
#
# Output file selection (file-level options $full and $merge_non_CpG):
#   $full and $merge_non_CpG : $fhs{CpG_context} / $fhs{other_context}
#   $merge_non_CpG only      : $fhs{$filehandle_index}->{CpG} / ->{other_c}
#   $full only               : $fhs{CpG_context} / {CHG_context} / {CHH_context}
#   neither                  : $fhs{$filehandle_index}->{CpG} / ->{CHG} / ->{CHH}
#
# Dies on a malformed CIGAR string, on a strand other than '+' or '-' and on
# an unrecognised character in the methylation call string (in every output
# mode, including $mbias_only).
sub print_individual_C_methylation_states_single_end{

    my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_;
    my @methylation_calls = split(//,$meth_call);

    #################################################################
    ### . for bases not involving cytosines                       ###
    ### X for methylated C in CHG context (was protected)         ###
    ### x for not methylated C in CHG context (was converted)     ###
    ### H for methylated C in CHH context (was protected)         ###
    ### h for not methylated C in CHH context (was converted)     ###
    ### Z for methylated C in CpG context (was protected)         ###
    ### z for not methylated C in CpG context (was converted)     ###
    #################################################################

    my $pos_offset   = 0; # this is only relevant for SAM reads with insertions or deletions
    my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels

    ### Composite CIGAR: one operation per element (e.g. 2M1D1M => M M D M).
    ### It stays empty for linear matches (<number>M) and for vanilla input.
    my @comp_cigar;

    if ($cigar and $cigar !~ /^\d+M$/){
        my @len = split (/\D+/,$cigar); # storing the length per operation
        my @ops = split (/\d+/,$cigar); # storing the operation
        shift @ops;                     # remove the empty first element

        die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

        foreach my $op_index (0..$#len){
            push @comp_cigar, ($ops[$op_index]) x $len[$op_index];
        }
    }

    ### adjusting the start position for all reads mapping to the reverse strand
    if ($strand eq '-') {

        # the composite CIGAR needs to be reversed for reverse strand reads, too
        @comp_cigar = reverse @comp_cigar if (@comp_cigar);

        unless ($ignore){ ### if --ignore was specified the start position has already been corrected

            if ($cigar){ ### SAM format
                if ($cigar =~ /^(\d+)M$/){ # linear match
                    $start += $1 - 1;
                }
                else{ # InDel read
                    # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't
                    my $MD_count = grep { $_ eq 'M' or $_ eq 'D' } @comp_cigar;
                    $start += $MD_count - 1;
                }
            }
            else{ ### vanilla format
                $start += length($meth_call)-1;
            }
        }
    }

    ### Forward strand reads are walked upwards from $start, reverse strand
    ### reads downwards from the (adjusted) $start
    my $direction;
    if ($strand eq '+') {
        $direction = 1;
    }
    elsif ($strand eq '-') {
        $direction = -1;
    }
    elsif ($full and !$merge_non_CpG) {
        die "The read had a strand orientation which was neither + nor -: $strand\n";
    }
    else {
        die "The strand information was neither + nor -: $strand\n";
    }

    for my $index (0..$#methylation_calls) {

        if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels
            my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar);
            $cigar_offset += $cigar_mod;
            $pos_offset   += $pos_mod;
        }

        my $call = $methylation_calls[$index];

        # bases not involving cytosines need no further processing
        next if ($call eq '.');

        my ($context,$state,$counter);

        if    ($call eq 'X') { ($context,$state,$counter) = ('CHG','meth','total_meCHG_count'); }
        elsif ($call eq 'x') { ($context,$state,$counter) = ('CHG','un'  ,'total_unmethylated_CHG_count'); }
        elsif ($call eq 'Z') { ($context,$state,$counter) = ('CpG','meth','total_meCpG_count'); }
        elsif ($call eq 'z') { ($context,$state,$counter) = ('CpG','un'  ,'total_unmethylated_CpG_count'); }
        elsif ($call eq 'H') { ($context,$state,$counter) = ('CHH','meth','total_meCHH_count'); }
        elsif ($call eq 'h') { ($context,$state,$counter) = ('CHH','un'  ,'total_unmethylated_CHH_count'); }
        elsif (lc $call eq 'u') {
            next; # 'u' and 'U' calls are accepted but not reported
        }
        else{
            # an unrecognised character is fatal in every mode, also with $mbias_only
            die "The methylation call string contained the following unrecognised character: $call\n";
        }

        $counting{$counter}++;

        unless ($mbias_only){
            ### methylated Cs (any context) will receive a forward (+) orientation
            ### not methylated Cs (any context) will receive a reverse (-) orientation
            my $orientation = ($state eq 'meth') ? '+' : '-';
            my $position    = $start + $direction * $index + $pos_offset;

            # The handle is looked up only when something is actually printed
            my $fh;
            if ($merge_non_CpG and $context ne 'CpG'){
                $fh = $full ? $fhs{other_context} : $fhs{$filehandle_index}->{other_c};
            }
            else{
                $fh = $full ? $fhs{$context.'_context'} : $fhs{$filehandle_index}->{$context};
            }

            print {$fh} join ("\t",$id,$orientation,$chrom,$position,$call),"\n";
        }

        # single-end reads are always recorded in the first M-bias table
        $mbias_1{$context}->{$index+1}->{$state}++;
    }

    return;
}
|
|
4499
|
|
4500
|
|
4501
|
|
### Prints the complete usage/help text of the methylation extractor.
###
### Takes no arguments. The text is written with a plain 'print', i.e. to the
### currently selected output filehandle (STDOUT unless the caller changed it).
### The heredoc is single-quoted on purpose: the help text contains characters
### such as '%', '<' and '>' and must never be interpolated.
### This sub only prints; it does not terminate the program itself.
sub print_helpfile{

  print <<'HOW_TO';


DESCRIPTION

The following is a brief description of all options to control the Bismark
methylation extractor. The script reads in a bisulfite read alignment results file
produced by the Bismark bisulfite mapper and extracts the methylation information
for individual cytosines. This information is found in the methylation call field
which can contain the following characters:

     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     ~~~   X   for methylated C in CHG context                      ~~~
     ~~~   x   for not methylated C in CHG context                  ~~~
     ~~~   H   for methylated C in CHH context                      ~~~
     ~~~   h   for not methylated C in CHH context                  ~~~
     ~~~   Z   for methylated C in CpG context                      ~~~
     ~~~   z   for not methylated C in CpG context                  ~~~
     ~~~   U   for methylated C in Unknown context (CN or CHN)      ~~~
     ~~~   u   for not methylated C in Unknown context (CN or CHN)  ~~~
     ~~~   .   for any bases not involving cytosines                ~~~
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The methylation extractor outputs result files for cytosines in CpG, CHG and CHH
context (this distinction is actually already made in Bismark itself). As the methylation
information for every C analysed can produce files which easily have tens or even hundreds of
millions of lines, file sizes can become very large and more difficult to handle. The C
methylation info additionally splits cytosine methylation calls up into one of the four possible
strands a given bisulfite read aligned against:

             OT      original top strand
             CTOT    complementary to original top strand

             OB      original bottom strand
             CTOB    complementary to original bottom strand

Thus, by default twelve individual output files are being generated per input file (unless
--comprehensive is specified, see below). The output files can be imported into a genome
viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact
unless the bisulfite reads were generated preserving directionality it doesn't make any
sense to look at the data in a strand-specific manner). Strand-specific output files can
optionally be skipped, in which case only three output files for CpG, CHG or CHH context
will be generated. For both the strand-specific and comprehensive outputs there is also
the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context.


The output files are in the following format (tab delimited):

<sequence_id>     <strand>      <chromosome>     <position>     <methylation call>


USAGE: methylation_extractor [options] <filenames>


ARGUMENTS:
==========

<filenames>              A space-separated list of Bismark result files in SAM format from
                         which methylation information is extracted for every cytosine in
                         the reads. For alignment files in the older custom Bismark output
                         see option '--vanilla'.

OPTIONS:

-s/--single-end          Input file(s) are Bismark result file(s) generated from single-end
                         read data. Specifying either --single-end or --paired-end is
                         mandatory.

-p/--paired-end          Input file(s) are Bismark result file(s) generated from paired-end
                         read data. Specifying either --paired-end or --single-end is
                         mandatory.

--vanilla                The Bismark result input file(s) are in the old custom Bismark format
                         (up to version 0.5.x) and not in SAM format which is the default as
                         of Bismark version 0.6.x or higher. Default: OFF.

--no_overlap             For paired-end reads it is theoretically possible that read_1 and
                         read_2 overlap. This option avoids scoring overlapping methylation
                         calls twice (only methylation calls of read 1 are used in the process
                         since read 1 has historically higher quality basecalls than read 2).
                         Whilst this option removes a bias towards more methylation calls
                         in the center of sequenced fragments it may de facto remove a sizable
                         proportion of the data. This option is highly recommended for paired-end
                         data.

--ignore <int>           Ignore the first <int> bp from the 5' end of Read 1 when processing the
                         methylation call string. This can remove e.g. a restriction enzyme site
                         at the start of each read or any other source of bias (e.g. PBAT-Seq data).

--ignore_r2 <int>        Ignore the first <int> bp from the 5' end of Read 2 of paired-end sequencing
                         results only. Since the first couple of bases in Read 2 of BS-Seq experiments
                         show a severe bias towards non-methylation as a result of end-repairing
                         sonicated fragments with unmethylated cytosines (see M-bias plot), it is
                         recommended that the first couple of bp of Read 2 are removed before
                         starting downstream analysis. Please see the section on M-bias plots in the
                         Bismark User Guide for more details.

--comprehensive          Specifying this option will merge all four possible strand-specific
                         methylation info into context-dependent output files. The default
                         contexts are:
                          - CpG context
                          - CHG context
                          - CHH context

--merge_non_CpG          This will produce two output files (in --comprehensive mode) or eight
                         strand-specific output files (default) for Cs in
                          - CpG context
                          - non-CpG context

--report                 Prints out a short methylation summary as well as the parameters used to run
                         this script.

--no_header              Suppresses the Bismark version header line in all output files for more convenient
                         batch processing.

-o/--output DIR          Allows specification of a different output directory (absolute or relative
                         path). If not specified explicitly, the output will be written to the current directory.

--samtools_path          The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
                         explicitly if Samtools is in the PATH already.

--gzip                   The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in
                         a GZIP compressed form to save disk space. This option does not work on bedGraph and
                         genome-wide cytosine reports as they are 'tiny' anyway.

--version                Displays version information.

-h/--help                Displays this help file and exits.

--mbias_only             The methylation extractor will read the entire file but only output the M-bias table and plots as
                         well as a report (optional) and then quit. Default: OFF.



bedGraph specific options:
==========================

--bedGraph               After finishing the methylation extraction, the methylation output is written into a
                         sorted bedGraph file that reports the position of a given cytosine and its methylation
                         state (in %, see details below). The methylation extractor output is temporarily split up into
                         temporary files, one per chromosome (written into the current directory or folder
                         specified with -o/--output); these temp files are then used for sorting and deleted
                         afterwards. By default, only cytosines in CpG context will be sorted. The option
                         '--CX_context' may be used to report all cytosines irrespective of sequence context
                         (this will take MUCH longer!). The default folder for temporary files during the sorting
                         process is the output directory. The bedGraph conversion step is performed by the external
                         module 'bismark2bedGraph'; this script needs to reside in the same folder as the
                         bismark_methylation_extractor itself.


--cutoff [threshold]     The minimum number of times a methylation state has to be seen for that nucleotide
                         before its methylation percentage is reported. Default: 1.

--remove_spaces          Replaces whitespaces in the sequence ID field with underscores to allow sorting.


--CX/--CX_context        The sorted bedGraph output file contains information on every single cytosine that was covered
                         in the experiment irrespective of its sequence context. This applies to both forward and
                         reverse strands. Please be aware that this option may generate large temporary and output files
                         and may take a long time to sort (up to many hours). Default: OFF.
                         (i.e. Default = CpG context only).

--buffer_size <string>   This allows you to specify the main memory sort buffer when sorting the methylation information.
                         Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
                         a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
                         (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
                         Defaults to 2G.

--scaffolds/--gazillion  Users working with unfinished genomes sporting tens or even hundreds of thousands of
                         scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
                         individual chromosome files. These errors were caused by the operating system's limit
                         of the number of filehandles that can be written to at any one time (typically 1024; to
                         find out this limit on Linux, type: ulimit -a).
                         To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
                         methylation calls into individual chromosome files. Instead, all input files are
                         temporarily merged into a single file (unless there is only a single file), and this
                         file will then be sorted by both chromosome AND position using the Unix sort command.
                         Please be aware that this option might take a looooong time to complete, depending on
                         the size of the input files, and the memory you allocate to this process (see --buffer_size).
                         Nevertheless, it seems to be working.

--ample_memory           Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
                         instead use two arrays to sort methylated and unmethylated calls. This may result in a faster
                         sorting process of very large files, but this comes at the cost of a larger memory footprint
                         (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB
                         of RAM). Due to overheads in creating and looping through these arrays it seems that it will
                         actually be *slower* for small files (few million alignments), and we are currently testing at
                         which point it is advisable to use this option. Note that --ample_memory is not compatible
                         with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with).



Genome-wide cytosine methylation report specific options:
=========================================================

--cytosine_report        After the conversion to bedGraph has completed, the option '--cytosine_report' produces a
                         genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based
                         chromosome coordinates (zero-based coords are optional) and reports CpG context only (all
                         cytosine context is optional). The output considers all Cs on both forward and reverse strands and
                         reports their position, strand, trinucleotide context and methylation state (counts are 0 if not
                         covered). The cytosine report conversion step is performed by the external module
                         'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor
                         itself.

--CX/--CX_context        The output file contains information on every single cytosine in the genome irrespective of
                         its context. This applies to both forward and reverse strands. Please be aware that this will
                         generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
                         Default: OFF (i.e. Default = CpG context only).

--zero_based             Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF.

--genome_folder <path>   Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
                         formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.

--split_by_chromosome    Writes the output into individual files for each chromosome instead of a single output file. Files
                         will be named to include the input filename and the chromosome number.



OUTPUT:

The bismark_methylation_extractor output is in the form:
========================================================
<seq-ID>  <methylation state*>  <chromosome>  <start position (= end position)>  <methylation call>

* Methylated cytosines receive a '+' orientation,
* Unmethylated cytosines receive a '-' orientation.



The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords):
=========================================================================================================

track type=bedGraph (header line)

<chromosome>  <start position>  <end position>  <methylation percentage>



The coverage output looks like this (tab-delimited, 1-based genomic coords):
============================================================================

<chromosome>  <start position>  <end position>  <methylation percentage>  <count methylated>  <count non-methylated>



The genome-wide cytosine methylation output file is tab-delimited in the following format:
==========================================================================================
<chromosome>  <position>  <strand>  <count methylated>  <count non-methylated>  <C-context>  <trinucleotide context>



This script was last modified on 25 November 2013.

HOW_TO
}
|