comparison bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/env perl
2 use warnings;
3 use strict;
4 $|++;
5 use Getopt::Long;
6 use Cwd;
7 use Carp;
8
9 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
10
11 ## This program is free software: you can redistribute it and/or modify
12 ## it under the terms of the GNU General Public License as published by
13 ## the Free Software Foundation, either version 3 of the License, or
14 ## (at your option) any later version.
15
16 ## This program is distributed in the hope that it will be useful,
17 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ## GNU General Public License for more details.
20
21 ## You should have received a copy of the GNU General Public License
22 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
23
24 my %chromosomes; # storing sequence information of all chromosomes/scaffolds
25 my %processed; # keeping a record of which chromosomes have been processed
26 my $coverage2cytosine_version = 'v0.16.3';
27
28 my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline();
29
30 warn "Summary of parameters for genome-wide cytosine report:\n";
31 warn '='x78,"\n";
32 warn "Coverage infile:\t\t\t$coverage_infile\n";
33 warn "Output directory:\t\t\t>$output_dir<\n";
34 warn "Parent directory:\t\t\t>$parent_dir<\n";
35 warn "Genome directory:\t\t\t>$genome_folder<\n";
36
37 if ($CX_context){
38 warn "CX context:\t\t\t\tyes\n";
39 }
40 else{
41 warn "CX context:\t\t\t\tno (CpG context only, default)\n";
42 }
43 if ($merge_CpGs){
44 warn "Pooling CpG top/bottom strand evidence:\tyes\n";
45 }
46 if($gc_context){
47 warn "Optional GC context track:\t\tyes\n";
48 }
49 if ($tetra){
50 warn "Tetra/Penta nucleotide context:\t\tyes\n";
51 }
52
53 if ($zero){
54 warn "Genome coordinates used:\t\t0-based (user specified)\n";
55 }
56 else{
57 warn "Genome coordinates used:\t\t1-based (default)\n";
58 }
59
60 if ($gzip){
61 warn "GZIP compression:\t\t\tyes\n";
62 }
63 else{
64 warn "GZIP compression:\t\t\tno\n";
65 }
66
67 if ($split_by_chromosome){
68 warn "Split by chromosome:\t\t\tyes\n\n\n";
69 }
70 else{
71 warn "Split by chromosome:\t\t\tno\n\n\n";
72 }
73 sleep (3);
74
75 read_genome_into_memory();
76 warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n";
77
78 generate_genome_wide_cytosine_report($coverage_infile);
79
80 ### 11 December 2014
81
82 # The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand
83 # into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package
84 # which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing
85
86 if ($merge_CpGs){
87 # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being
88 combine_CpGs_to_single_CG_entity($cytosine_out);
89 }
90
91 ### 18 August 2015
92
93 # The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed
94 if ($gc_context){
95 generate_GC_context_report($coverage_infile);
96 }
97
98
99 sub combine_CpGs_to_single_CG_entity{
100 my $CpG_report_file = shift;
101 warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n";
102
103 open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n";
104 my $pooled_CG = $CpG_report_file;
105 $pooled_CG =~ s/$/.merged_CpG_evidence.cov/;
106 open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n";
107 warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n";
108 sleep(1);
109
110 while (1){
111 my $line1 = <IN>;
112 my $line2 = <IN>;
113 last unless ($line1 and $line2);
114
115 chomp $line1;
116 chomp $line2;
117
118 my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
119 my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
120 # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
121 # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1);
122
123 if ($pos1 < 2){
124 $line1 = $line2;
125 $line2 = <IN>;
126 chomp $line2;
127 ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
128 ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
129 }
130
131 # a few sanity checks
132 die "The context of the first line was not CG:\t$line1" unless ($context1 eq 'CG');
133 die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG');
134
135 unless ($strand1 eq '+' and $strand2 eq'-'){
136 die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n";
137 }
138 unless ($pos2 eq ($pos1 + 1)){
139 die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n";
140 }
141 unless($chr1 eq $chr2){
142 die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n";
143 }
144
145 # This looks alright from what I can tell, lets pool the 2 strands
146
147 my $pooled_m = $m1 + $m2;
148 my $pooled_u = $u1 + $u2;
149
150 # since this is a new coverage file we only write it out if the coverage is at least 1
151 next unless ($pooled_m + $pooled_u) > 0;
152
153 my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100);
154 # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
155 # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n";
156 # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
157 # sleep(1);
158 print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
159 }
160
161 warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n";
162
163 }
164
165
166 sub process_commandline{
167 my $help;
168 my $output_dir;
169 my $genome_folder;
170 my $zero;
171 my $CpG_only;
172 my $CX_context;
173 my $gzip;
174
175 my $split_by_chromosome;
176 my $cytosine_out;
177 my $parent_dir;
178 my $version;
179 my $merge_CpGs;
180 my $gc_context;
181 my $tetra;
182
183 my $command_line = GetOptions ('help|man' => \$help,
184 'dir=s' => \$output_dir,
185 'g|genome_folder=s' => \$genome_folder,
186 "zero_based" => \$zero,
187 "CX|CX_context" => \$CX_context,
188 "split_by_chromosome" => \$split_by_chromosome,
189 'o|output=s' => \$cytosine_out,
190 'parent_dir=s' => \$parent_dir,
191 'version' => \$version,
192 'merge_CpGs' => \$merge_CpGs,
193 'GC|GC_context' => \$gc_context,
194 'ff|qq' => \$tetra,
195 'gzip' => \$gzip,
196 );
197
198 ### EXIT ON ERROR if there were errors with any of the supplied options
199 unless ($command_line){
200 die "Please respecify command line options\n";
201 }
202
203 ### HELPFILE
204 if ($help){
205 print_helpfile();
206 exit;
207 }
208
209 if ($version){
210 print << "VERSION";
211
212
213 Bismark Methylation Extractor Module -
214 coverage2cytosine
215
216 Bismark Extractor Version: $coverage2cytosine_version
217 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
218 www.bioinformatics.babraham.ac.uk/projects/bismark/
219
220
221 VERSION
222 exit;
223 }
224
225 ### no files provided
226 unless (@ARGV){
227 warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n";
228 sleep(2);
229
230 print_helpfile();
231 exit;
232 }
233
234 my $coverage_infile = shift @ARGV;
235
236 unless ($parent_dir){
237 $parent_dir = getcwd();
238 }
239 unless ($parent_dir =~ /\/$/){
240 $parent_dir =~ s/$/\//;
241 }
242
243 unless (defined $cytosine_out){
244 die "Please provide the name of the output file using the option -o/--output filename\n";
245 }
246
247 ### OUTPUT DIR PATH
248 if (defined $output_dir){
249 unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
250 unless ($output_dir =~ /\/$/){
251 $output_dir =~ s/$/\//;
252 }
253 }
254 }
255 else{
256 $output_dir = '';
257 }
258
259 unless ($CX_context){
260 $CX_context = 0;
261 $CpG_only = 1;
262 }
263
264 ### GENOME folder
265 if ($genome_folder){
266 unless ($genome_folder =~/\/$/){
267 $genome_folder =~ s/$/\//;
268 }
269 }
270 else{
271 die "Please specify a genome folder to proceed (full path only)\n";
272 }
273
274 if ($merge_CpGs){
275 if ($CX_context){
276 die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n";
277 }
278 if ($split_by_chromosome){
279 die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n";
280 }
281 }
282
283 return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra);
284 }
285
286
287
288 sub generate_genome_wide_cytosine_report {
289
290 warn "="x78,"\n";
291 warn "Methylation information will now be written into a genome-wide cytosine report\n";
292 warn "="x78,"\n\n";
293 sleep (2);
294
295 my $number_processed = 0;
296
297 ### changing to the output directory again
298 unless ($output_dir eq ''){ # default
299 chdir $output_dir or die "Failed to change directory to $output_dir\n";
300 # warn "Changed directory to $output_dir\n";
301 }
302
303 my $in = shift;
304 # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir
305 if ($in =~ /gz$/){
306 open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16
307 }
308 else{
309 open (IN,"$in") or die "Failed to read from file $in: $!\n";
310 }
311
312
313 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
314 unless ($split_by_chromosome){ ### writing all output to a single file (default)
315 if ($gzip){
316 unless ($cytosine_out =~ /\.gz$/){
317 $cytosine_out .= '.gz';
318 }
319 open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n";
320 }
321 else{
322 open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n";
323 }
324 warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n";
325 sleep (1);
326 }
327
328 my $last_chr;
329 my %chr; # storing reads for one chromosome at a time
330
331 my $count = 0;
332 while (<IN>){
333 chomp;
334 ++$count;
335 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
336
337 # defining the first chromosome
338 unless (defined $last_chr){
339 $last_chr = $chr;
340 ++$number_processed;
341 # warn "Storing all covered cytosine positions for chromosome: $chr\n";
342 }
343
344 ### As of version 0.9.1 the start positions are 1-based!
345 if ($chr eq $last_chr){
346 $chr{$chr}->{$start}->{meth} = $meth;
347 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
348 }
349 else{
350 warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
351 ++$number_processed;
352
353 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
354 my $chromosome_out = $cytosine_out;
355 if ($chromosome_out =~ /\.txt$/){
356 $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
357 }
358 else{
359 $chromosome_out =~ s/$/.chr${last_chr}.txt/;
360 }
361 open (CYT,'>',$chromosome_out) or die $!;
362 # warn "Writing output for $last_chr to $chromosome_out\n";
363 }
364
365 my $tri_nt;
366 my $tetra_nt;
367 my $penta_nt;
368 my $context;
369
370 $processed{$last_chr} = 1;
371 while ( $chromosomes{$last_chr} =~ /([CG])/g){
372
373 $tri_nt = '';
374 $context = '';
375
376 if ($tetra){
377 $tetra_nt = ''; # clearing
378 $penta_nt = '';
379 }
380
381 my $pos = pos$chromosomes{$last_chr};
382
383 my $strand;
384 my $meth = 0;
385 my $nonmeth = 0;
386
387 if ($1 eq 'C'){ # C on forward strand
388 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
389 if ($tetra){
390 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
391 $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
392 }
393 else{
394 $tetra_nt = '';
395 }
396 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
397 $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
398 }
399 else{
400 $penta_nt = '';
401 }
402 }
403 $strand = '+';
404 }
405 elsif ($1 eq 'G'){ # C on reverse strand
406
407 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
408 $tri_nt = reverse $tri_nt;
409 $tri_nt =~ tr/ACTG/TGAC/;
410
411 if ($tetra){
412 if ( $pos - 4 >= 0 ){
413 $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
414 $tetra_nt = reverse $tetra_nt;
415 $tetra_nt =~ tr/ACTG/TGAC/;
416 }
417 else{
418 $tetra_nt = '';
419 }
420 if ( $pos - 5 >= 0 ){
421 $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
422 $penta_nt = reverse $penta_nt;
423 $penta_nt =~ tr/ACTG/TGAC/;
424 }
425 else{
426 $penta_nt = '';
427 }
428 }
429 $strand = '-';
430 }
431
432 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
433
434 # if (length$penta_nt < 5){
435 # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
436 # }
437
438 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1)
439 $meth = $chr{$last_chr}->{$pos}->{meth};
440 $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
441 }
442
443
444 ### determining cytosine context
445 if ($tri_nt =~ /^CG/){
446 $context = 'CG';
447 }
448 elsif ($tri_nt =~ /^C.{1}G$/){
449 $context = 'CHG';
450 }
451 elsif ($tri_nt =~ /^C.{2}$/){
452 $context = 'CHH';
453 }
454 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
455 warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
456 next;
457 }
458
459 if ($CpG_only){
460 if ($tri_nt =~ /^CG/){ # CpG context is the default
461 if ($zero){ # zero based coordinates
462 $pos -= 1;
463 if ($tetra){
464 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
465 }
466 else{
467 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
468 }
469 }
470 else{ # default
471 if ($tetra){
472 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
473 }
474 else{
475 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
476 }
477 }
478 }
479 }
480 else{ ## all cytosines, specified with --CX
481 if ($zero){ # zero based coordinates
482 $pos -= 1;
483 if ($tetra){
484 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
485 }
486 else{
487 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"
488 }
489 }
490 else{ # default
491 if ($tetra){
492 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
493 }
494 else{
495 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
496 }
497 }
498 }
499 }
500
501 %chr = (); # resetting the hash
502
503 # new first entry
504 $last_chr = $chr;
505 $chr{$chr}->{$start}->{meth} = $meth;
506 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
507 }
508 }
509
510 # Last found chromosome
511 # If there never was a last chromosome then something must have gone wrong with reading the data in
512 unless (defined $last_chr){
513 die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n";
514 }
515
516 warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
517 $processed{$last_chr} = 1;
518
519 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
520 my $chromosome_out = $cytosine_out;
521 if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt
522 $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
523 }
524 else{ # user specified output file name
525 $chromosome_out =~ s/$/.chr${last_chr}.txt/;
526 }
527 open (CYT,'>',$chromosome_out) or die $!;
528 # warn "Writing output for $last_chr to $chromosome_out\n";
529 }
530
531 my $tri_nt;
532 my $tetra_nt;
533 my $penta_nt;
534 my $context;
535
536 while ( $chromosomes{$last_chr} =~ /([CG])/g){
537
538 $tri_nt = '';
539 $context = '';
540
541 if ($tetra){
542 $tetra_nt = '';
543 $penta_nt = '';
544 }
545
546 my $pos = pos$chromosomes{$last_chr};
547
548 my $strand;
549 my $meth = 0;
550 my $nonmeth = 0;
551
552 if ($1 eq 'C'){ # C on forward strand
553 $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
554 $strand = '+';
555
556 if ($tetra){
557 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
558 $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
559 }
560 else{
561 $tetra_nt = '';
562 }
563 if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
564 $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
565 }
566 else{
567 $penta_nt = '';
568 }
569 }
570
571 }
572 elsif ($1 eq 'G'){ # C on reverse strand
573 $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based!
574 $tri_nt = reverse $tri_nt;
575 $tri_nt =~ tr/ACTG/TGAC/;
576 $strand = '-';
577
578 if ($tetra){
579 if ( $pos - 4 >= 0 ){
580 $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
581 $tetra_nt = reverse $tetra_nt;
582 $tetra_nt =~ tr/ACTG/TGAC/;
583 }
584 else{
585 $tetra_nt = '';
586 }
587
588 if ( $pos - 5 >= 0 ){
589 $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
590 $penta_nt = reverse $penta_nt;
591 $penta_nt =~ tr/ACTG/TGAC/;
592 }
593 else{
594 $penta_nt = '';
595 }
596 }
597 }
598
599 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1
600 $meth = $chr{$last_chr}->{$pos}->{meth};
601 $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
602 }
603
604 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
605 # if (length$penta_nt < 5){
606 # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
607 # }
608
609 ### determining cytosine context
610 if ($tri_nt =~ /^CG/){
611 $context = 'CG';
612 }
613 elsif ($tri_nt =~ /^C.{1}G$/){
614 $context = 'CHG';
615 }
616 elsif ($tri_nt =~ /^C.{2}$/){
617 $context = 'CHH';
618 }
619 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
620 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
621 next;
622 }
623
624 if ($CpG_only){
625 if ($tri_nt =~ /^CG/){ # CpG context is the default
626 if ($zero){ # zero-based coordinates
627 $pos -= 1;
628 if ($tetra){
629 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
630 }
631 else{
632 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
633 }
634 }
635 else{ # default
636 if ($tetra){
637 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
638 }
639 else{
640 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
641 }
642 }
643 }
644 }
645 else{ ## all cytosines, specified with --CX
646 if ($zero){ # zero based coordinates
647 $pos -= 1;
648 if ($tetra){
649 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
650 }
651 else{
652 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
653 }
654 }
655 else{ # default
656 if ($tetra){
657 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
658 }
659 else{
660 print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
661 }
662 }
663 }
664 }
665
666 warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";
667
668 ### Now processing chromosomes that were not covered in the coverage file
669 warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n";
670 my $unprocessed = 0;
671
672 foreach my $chr (sort keys %processed) {
673 unless ( $processed{$chr} ) {
674 ++$unprocessed;
675 ++$number_processed;
676 process_unprocessed_chromosomes($chr);
677 }
678 }
679
680 if ($unprocessed == 0) {
681 warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n";
682 }
683 else{
684 warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n";
685 }
686
687 close CYT or warn $!;
688
689 }
690
691
692 #### GC CONTEXT - optional
693 ####
694
695 sub generate_GC_context_report {
696
697 warn "="x82,"\n";
698 warn "Methylation information for GC context will now be written to a GpC-context report\n";
699 warn "="x82,"\n\n";
700 sleep (2);
701
702 my $number_processed = 0;
703
704 ### changing to the output directory again
705 unless ($output_dir eq ''){ # default
706 chdir $output_dir or die "Failed to change directory to $output_dir\n";
707 # warn "Changed directory to $output_dir\n";
708 }
709
710 my $in = shift;
711 if ($in =~ /gz$/){
712 open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n";
713 }
714 else{
715 open (IN,"$in") or die "Failed to read from file $in: $!\n";
716 }
717
718 my $gc_out = $cytosine_out.'.GpC_report.txt';
719 my $gc_cov = $cytosine_out.'.GpC.cov';
720
721 ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
722 open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n";
723 warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n";
724 open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n";
725 warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n";
726
727 my $last_chr;
728 my %chr; # storing reads for one chromosome at a time
729
730 my $count = 0;
731 while (<IN>){
732 chomp;
733 ++$count;
734 my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);
735
736 # defining the first chromosome
737 unless (defined $last_chr){
738 $last_chr = $chr;
739 ++$number_processed;
740 # warn "Storing all covered cytosine positions for chromosome: $chr\n";
741 }
742
743 ### start positions are 1-based
744 if ($chr eq $last_chr){
745 $chr{$chr}->{$start}->{meth} = $meth;
746 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
747 }
748 else{
749 warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
750 ++$number_processed;
751 $processed{$last_chr} = 1;
752 while ( $chromosomes{$last_chr} =~ /(GC)/g){
753
754 # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
755 my $context_top = '';
756 my $context_bottom = '';
757 my $pos = pos$chromosomes{$last_chr};
758
759 my $meth_top = 0;
760 my $meth_bottom = 0;
761 my $nonmeth_top = 0;
762 my $nonmeth_bottom = 0;
763
764 #warn "$1\n"; sleep(1);
765 # C on forward strand
766 my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
767 my $strand_top = '+';
768
769 # C on reverse strand
770 my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based!
771 $tri_nt_bottom = reverse $tri_nt_bottom;
772 $tri_nt_bottom =~ tr/ACTG/TGAC/;
773 my $strand_bottom = '-';
774
775 next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand
776 next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the reverse strand
777
778 # top strand
779 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
780 $meth_top = $chr{$last_chr}->{$pos}->{meth};
781 $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
782 }
783 # bottom strand
784 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
785 $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth};
786 $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
787 }
788
789 ### determining cytosine context top strand
790 if ($tri_nt_top =~ /^CG/){
791 $context_top = 'CG';
792 }
793 elsif ($tri_nt_top =~ /^C.{1}G$/){
794 $context_top = 'CHG';
795 }
796 elsif ($tri_nt_top =~ /^C.{2}$/){
797 $context_top = 'CHH';
798 }
799 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
800 warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
801 next;
802 }
803
804 ### determining cytosine context bottom strand
805 if ($tri_nt_bottom =~ /^CG/){
806 $context_bottom = 'CG';
807 }
808 elsif ($tri_nt_bottom =~ /^C.{1}G$/){
809 $context_bottom = 'CHG';
810 }
811 elsif ($tri_nt_bottom =~ /^C.{2}$/){
812 $context_bottom = 'CHH';
813 }
814 else{ # if the context can't be determined the positions will not be printed
815 warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
816 next;
817 }
818
819 # if Cs were not covered at all they are not written out
820 unless ($meth_bottom + $nonmeth_bottom == 0){
821 my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
822 print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
823 print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
824 }
825 unless ($meth_top + $nonmeth_top == 0){
826 my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
827 print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
828 print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
829 }
830
831 }
832
833 %chr = (); # resetting the hash
834
835 # new first entry
836 $last_chr = $chr;
837 $chr{$chr}->{$start}->{meth} = $meth;
838 $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
839 }
840 }
841
842 # Last found chromosome
843 warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
844 $processed{$last_chr} = 1;
845 while ( $chromosomes{$last_chr} =~ /(GC)/g){
846
847 # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
848 my $context_top = '';
849 my $context_bottom = '';
850 my $pos = pos$chromosomes{$last_chr};
851
852 my $meth_top = 0;
853 my $meth_bottom = 0;
854 my $nonmeth_top = 0;
855 my $nonmeth_bottom = 0;
856
857 # C on forward strand
858 my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based!
859 my $strand_top = '+';
860
861 # C on reverse strand
862 my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based!
863 $tri_nt_bottom = reverse $tri_nt_bottom;
864 $tri_nt_bottom =~ tr/ACTG/TGAC/;
865 my $strand_bottom = '-';
866
867 next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand
868 next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the bottom strand
869
870 ### determining cytosine context top strand
871 if ($tri_nt_top =~ /^CG/){
872 $context_top = 'CG';
873 }
874 elsif ($tri_nt_top =~ /^C.{1}G$/){
875 $context_top = 'CHG';
876 }
877 elsif ($tri_nt_top =~ /^C.{2}$/){
878 $context_top = 'CHH';
879 }
880 else{ # if the context can't be determined the positions will not be printed
881 warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
882 next;
883 }
884
885 ### determining cytosine context bottom strand
886 if ($tri_nt_bottom =~ /^CG/){
887 $context_bottom = 'CG';
888 }
889 elsif ($tri_nt_bottom =~ /^C.{1}G$/){
890 $context_bottom = 'CHG';
891 }
892 elsif ($tri_nt_bottom =~ /^C.{2}$/){
893 $context_bottom = 'CHH';
894 }
895 else{ # if the context can't be determined the positions will not be printed
896 warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
897 next;
898 }
899
900 # top strand
901 if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
902 $meth_top = $chr{$last_chr}->{$pos}->{meth};
903 $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
904 }
905 # bottom strand
906 if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
907 $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth};
908 $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
909 }
910
911 # if Cs were not covered at all they are not written out
912 unless ($meth_bottom + $nonmeth_bottom == 0){
913 my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
914 print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
915 print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
916 }
917 unless ($meth_top + $nonmeth_top == 0){
918 my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
919 print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
920 print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
921 }
922 }
923
924 warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";
925
926 close GC or warn $!;
927
928 }
929
930 ####
931 ####
932
933
934 sub process_unprocessed_chromosomes{
935
936 my $chr = shift;
937
938 warn "Writing cytosine report for not covered chromosome $chr\n";
939 $processed{$chr} = 1;
940
941 if ($split_by_chromosome){ ## writing output to 1 file per chromosome
942 my $chromosome_out = $cytosine_out;
943 if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt
944 $chromosome_out =~ s/txt$/chr${chr}.txt/;
945 }
946 else{ # user specified output file name
947 $chromosome_out =~ s/$/.chr${chr}.txt/;
948 }
949
950 open (CYT,'>',$chromosome_out) or die $!;
951 # warn "Writing output for $last_chr to $chromosome_out\n";
952 }
953
954 my $tri_nt;
955 my $tetra_nt;
956 my $penta_nt;
957 my $context;
958
959 while ( $chromosomes{$chr} =~ /([CG])/g){
960
961 $tri_nt = '';
962 $context = '';
963 if ($tetra){
964 $tetra_nt = ''; # clearing
965 $penta_nt = '';
966 }
967
968 my $pos = pos$chromosomes{$chr};
969
970 my $strand;
971 my $meth = 0;
972 my $nonmeth = 0;
973
974 if ($1 eq 'C'){ # C on forward strand
975 $tri_nt = substr ($chromosomes{$chr},($pos-1),3); # positions are 0-based!
976 $strand = '+';
977 if ($tetra){
978 if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){
979 $tetra_nt = substr ($chromosomes{$chr},($pos-1),4);
980 }
981 else{
982 $tetra_nt = '';
983 }
984 if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){
985 $penta_nt = substr ($chromosomes{$chr},($pos-1),5);
986 }
987 else{
988 $penta_nt = '';
989 }
990 }
991 }
992 elsif ($1 eq 'G'){ # C on reverse strand
993 $tri_nt = substr ($chromosomes{$chr},($pos-3),3); # positions are 0-based!
994 $tri_nt = reverse $tri_nt;
995 $tri_nt =~ tr/ACTG/TGAC/;
996 $strand = '-';
997
998 if ($tetra){
999 if ( $pos - 4 >= 0 ){
1000 $tetra_nt = substr ($chromosomes{$chr},($pos-4),4);
1001 $tetra_nt = reverse $tetra_nt;
1002 $tetra_nt =~ tr/ACTG/TGAC/;
1003 }
1004 else{
1005 $tetra_nt = '';
1006 }
1007 if ( $pos - 5 >= 0 ){
1008 $penta_nt = substr ($chromosomes{$chr},($pos-5),5);
1009 $penta_nt = reverse $penta_nt;
1010 $penta_nt =~ tr/ACTG/TGAC/;
1011 }
1012 else{
1013 $penta_nt = '';
1014 }
1015 }
1016 $strand = '-';
1017 }
1018
1019 next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
1020
1021 ### determining cytosine context
1022 if ($tri_nt =~ /^CG/){
1023 $context = 'CG';
1024 }
1025 elsif ($tri_nt =~ /^C.{1}G$/){
1026 $context = 'CHG';
1027 }
1028 elsif ($tri_nt =~ /^C.{2}$/){
1029 $context = 'CHH';
1030 }
1031 else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
1032 warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
1033 next;
1034 }
1035
1036 if ($CpG_only){
1037 if ($tri_nt =~ /^CG/){ # CpG context is the default
1038 if ($zero){ # zero-based coordinates
1039 $pos -= 1;
1040 if ($tetra){
1041 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
1042 }
1043 else{
1044 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
1045 }
1046 }
1047 else{ # default
1048 if ($tetra){
1049 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
1050 }
1051 else{
1052 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
1053 }
1054 }
1055 }
1056 }
1057 else{ ## all cytosines, specified with --CX
1058 if ($zero){ # zero based coordinates
1059 $pos -= 1;
1060 if ($tetra){
1061 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
1062 }
1063 else{
1064 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
1065 }
1066 }
1067 else{ # default
1068 if ($tetra){
1069 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
1070 }
1071 else{
1072 print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
1073 }
1074 }
1075 }
1076 }
1077
1078 # close CYT or warn $!;
1079
1080 }
1081
1082
1083 sub read_genome_into_memory{
1084
1085 ## reading in and storing the specified genome in the %chromosomes hash
1086 chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
1087 warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";
1088
1089 my @chromosome_filenames = <*.fa>;
1090
1091 ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
1092 unless (@chromosome_filenames){
1093 @chromosome_filenames = <*.fasta>;
1094 }
1095 unless (@chromosome_filenames){
1096 die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
1097 }
1098
1099 foreach my $chromosome_filename (@chromosome_filenames){
1100
1101 # skipping the tophat entire mouse genome fasta file
1102 next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');
1103
1104 open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
1105 ### first line needs to be a fastA header
1106 my $first_line = <CHR_IN>;
1107 chomp $first_line;
1108 $first_line =~ s/\r//; # removing /r carriage returns
1109
1110 ### Extracting chromosome name from the FastA header
1111 my $chromosome_name = extract_chromosome_name($first_line);
1112
1113 my $sequence;
1114 while (<CHR_IN>){
1115 chomp;
1116 $_ =~ s/\r//; # removing /r carriage returns
1117
1118 if ($_ =~ /^>/){
1119 ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
1120 if (exists $chromosomes{$chromosome_name}){
1121 warn "chr $chromosome_name (",length $sequence ," bp)\n";
1122 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
1123 }
1124 else {
1125 if (length($sequence) == 0){
1126 warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
1127 }
1128 warn "chr $chromosome_name (",length $sequence ," bp)\n";
1129 $chromosomes{$chromosome_name} = $sequence;
1130 $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
1131 }
1132 ### resetting the sequence variable
1133 $sequence = '';
1134 ### setting new chromosome name
1135 $chromosome_name = extract_chromosome_name($_);
1136 }
1137 else{
1138 $sequence .= uc$_;
1139 }
1140 }
1141
1142 if (exists $chromosomes{$chromosome_name}){
1143 warn "chr $chromosome_name (",length $sequence ," bp)\t";
1144 die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
1145 }
1146 else{
1147 if (length($sequence) == 0){
1148 warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
1149 }
1150 warn "chr $chromosome_name (",length $sequence ," bp)\n";
1151 $chromosomes{$chromosome_name} = $sequence;
1152 $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
1153 }
1154 }
1155 warn "\n";
1156 chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
1157 }
1158
1159 sub extract_chromosome_name {
1160 ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
1161 my $fasta_header = shift;
1162 if ($fasta_header =~ s/^>//){
1163 my ($chromosome_name) = split (/\s+/,$fasta_header);
1164 return $chromosome_name;
1165 }
1166 else{
1167 die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
1168 }
1169 }
1170
1171
1172 sub print_helpfile{
1173
1174 warn <<EOF
1175
1176 SYNOPSIS:
1177
1178 This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced
1179 by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only
1180 (for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'.
1181
1182 The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be
1183 sorted by position and exactly in the following format:
1184
1185 <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count unmethylated>
1186
1187 The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!).
1188
1189
1190 USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input]
1191
1192
1193 -o/--output <filename> Name of the output file, mandatory.
1194
1195 --dir Output directory. Output is written to the current directory if not specified explicitly.
1196
1197 --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
1198 formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.
1199
1200 -CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of
1201 its context. This applies to both forward and reverse strands. Please be aware that this will
1202 generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
1203 Default: OFF (i.e. Default = CpG context only).
1204
1205 --merge_CpG Using this option will post-process the genome-wide report to write out an additional coverage
1206 file (see above for the coverage file format) which has the top and bottom strand methylation
1207 evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format
1208 for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be:
1209
1210 genome-wide CpG report (old)
1211 gi|9626372|ref|NC_001422.1| 157 + 313 156 CG
1212 gi|9626372|ref|NC_001422.1| 158 - 335 156 CG
1213 merged CpG evidence coverage file (new)
1214 gi|9626372|ref|NC_001422.1| 157 158 67.500000 648 312
1215
1216 This option is currently experimental, and only works if CpG context only and a single genome-wide report
1217 were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome).
1218
1219 --gc/--gc_context In addition to normal processing this option will reprocess the genome to find methylation in
1220 GpC context. This might be useful for specialist applications where GpC methylases had been
1221 deployed. The output format is exactly the same as for the normal cytosine report, and only
1222 positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition
1223 this will write out a Bismark coverage file (ending in GpC.cov).
1224
1225 --ff In addition to the standard output selecting --ff will also extract a four and five (tetra/penta)
1226 nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the
1227 chromosome) will be left blank; sequences containing Ns are ignored.
1228
1229 --zero_based Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF.
1230
1231 --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files
1232 will be named to include the input filename and the chromosome number.
1233
1234 --gzip Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output.
1235
1236 --help Displays this help message and exits
1237
1238
1239
1240 OUTPUT FORMAT:
1241
1242 The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords):
1243 ===========================================================================================================
1244
1245 <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context>
1246
1247
1248 Script last modified: 04 April 2016
1249
1250 EOF
1251 ;
1252 exit 1;
1253 }
1254