Mercurial > repos > bgruening > bismark
view bismark_methyl_extractor/coverage2cytosine @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use warnings; use strict; $|++; use Getopt::Long; use Cwd; use Carp; ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. my %chromosomes; # storing sequence information of all chromosomes/scaffolds my %processed; # keeping a record of which chromosomes have been processed my $coverage2cytosine_version = 'v0.16.3'; my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline(); warn "Summary of parameters for genome-wide cytosine report:\n"; warn '='x78,"\n"; warn "Coverage infile:\t\t\t$coverage_infile\n"; warn "Output directory:\t\t\t>$output_dir<\n"; warn "Parent directory:\t\t\t>$parent_dir<\n"; warn "Genome directory:\t\t\t>$genome_folder<\n"; if ($CX_context){ warn "CX context:\t\t\t\tyes\n"; } else{ warn "CX context:\t\t\t\tno (CpG context only, default)\n"; } if ($merge_CpGs){ warn "Pooling CpG top/bottom strand evidence:\tyes\n"; } if($gc_context){ warn "Optional GC context track:\t\tyes\n"; } if ($tetra){ warn "Tetra/Penta nucleotide context:\t\tyes\n"; } if ($zero){ warn "Genome coordinates used:\t\t0-based (user specified)\n"; } else{ warn "Genome coordinates used:\t\t1-based (default)\n"; } if ($gzip){ warn "GZIP compression:\t\t\tyes\n"; } else{ warn "GZIP compression:\t\t\tno\n"; } if ($split_by_chromosome){ warn "Split by chromosome:\t\t\tyes\n\n\n"; } else{ warn "Split by chromosome:\t\t\tno\n\n\n"; } sleep (3); read_genome_into_memory(); warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n"; generate_genome_wide_cytosine_report($coverage_infile); ### 11 December 2014 # The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand # into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package # which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing if ($merge_CpGs){ # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being combine_CpGs_to_single_CG_entity($cytosine_out); } ### 18 August 2015 # The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed if ($gc_context){ generate_GC_context_report($coverage_infile); } sub combine_CpGs_to_single_CG_entity{ my $CpG_report_file = shift; warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n"; open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n"; my $pooled_CG = $CpG_report_file; $pooled_CG =~ s/$/.merged_CpG_evidence.cov/; open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n"; warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n"; sleep(1); while (1){ my $line1 = <IN>; my $line2 = <IN>; last unless ($line1 and $line2); chomp $line1; chomp $line2; my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1); if ($pos1 < 2){ $line1 = $line2; $line2 = <IN>; chomp $line2; ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); } # a few sanity checks die "The context of the first line was not CG:\t$line1" unless ($context1 eq 'CG'); die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG'); unless ($strand1 eq '+' and $strand2 eq'-'){ die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n"; } unless ($pos2 eq ($pos1 + 1)){ die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n"; } unless($chr1 eq $chr2){ die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n"; } # This looks alright from what I can tell, lets pool the 2 strands my $pooled_m = $m1 + $m2; my $pooled_u = $u1 + $u2; # since this is a new coverage file we only write it out if the coverage is at least 1 next unless ($pooled_m + $pooled_u) > 0; my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100); # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; # sleep(1); print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; } warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n"; } sub process_commandline{ my $help; my $output_dir; my $genome_folder; my $zero; my $CpG_only; my $CX_context; my $gzip; my $split_by_chromosome; my $cytosine_out; my $parent_dir; my $version; my $merge_CpGs; my $gc_context; my $tetra; my $command_line = GetOptions ('help|man' => \$help, 'dir=s' => \$output_dir, 'g|genome_folder=s' => \$genome_folder, "zero_based" => \$zero, "CX|CX_context" => \$CX_context, "split_by_chromosome" => \$split_by_chromosome, 'o|output=s' => \$cytosine_out, 'parent_dir=s' => \$parent_dir, 'version' => \$version, 'merge_CpGs' => \$merge_CpGs, 'GC|GC_context' => \$gc_context, 'ff|qq' => \$tetra, 'gzip' => \$gzip, ); ### EXIT ON ERROR if there were errors with any of the supplied options unless ($command_line){ die "Please respecify command line options\n"; } ### HELPFILE if ($help){ print_helpfile(); exit; } if ($version){ print << "VERSION"; Bismark Methylation Extractor Module - coverage2cytosine Bismark Extractor Version: $coverage2cytosine_version Copyright 2010-15 Felix Krueger, Babraham Bioinformatics www.bioinformatics.babraham.ac.uk/projects/bismark/ VERSION exit; } ### no files provided unless (@ARGV){ warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n"; sleep(2); print_helpfile(); exit; } my $coverage_infile = shift @ARGV; unless ($parent_dir){ $parent_dir = getcwd(); } unless ($parent_dir =~ /\/$/){ $parent_dir =~ s/$/\//; } unless (defined $cytosine_out){ die "Please provide the name of the output file using the option -o/--output filename\n"; } ### OUTPUT DIR PATH if (defined $output_dir){ unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it unless ($output_dir =~ /\/$/){ $output_dir =~ s/$/\//; } } } else{ $output_dir = ''; } unless ($CX_context){ $CX_context = 0; $CpG_only = 1; } ### GENOME folder if ($genome_folder){ unless ($genome_folder =~/\/$/){ $genome_folder =~ s/$/\//; } } else{ die "Please specify a genome folder to proceed (full path only)\n"; } if ($merge_CpGs){ if ($CX_context){ die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n"; } if ($split_by_chromosome){ die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n"; } } return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra); } sub generate_genome_wide_cytosine_report { warn "="x78,"\n"; warn "Methylation information will now be written into a genome-wide cytosine report\n"; warn "="x78,"\n\n"; sleep (2); my $number_processed = 0; ### changing to the output directory again unless ($output_dir eq ''){ # default chdir $output_dir or die "Failed to change directory to $output_dir\n"; # warn "Changed directory to $output_dir\n"; } my $in = shift; # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir if ($in =~ /gz$/){ open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16 } else{ open (IN,"$in") or die "Failed to read from file $in: $!\n"; } ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands unless ($split_by_chromosome){ ### writing all output to a single file (default) if ($gzip){ unless ($cytosine_out =~ /\.gz$/){ $cytosine_out .= '.gz'; } open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n"; } else{ open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n"; } warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n"; sleep (1); } my $last_chr; my %chr; # storing reads for one chromosome at a time my $count = 0; while (<IN>){ chomp; ++$count; my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); # defining the first chromosome unless (defined $last_chr){ $last_chr = $chr; ++$number_processed; # warn "Storing all covered cytosine positions for chromosome: $chr\n"; } ### As of version 0.9.1 the start positions are 1-based! if ($chr eq $last_chr){ $chr{$chr}->{$start}->{meth} = $meth; $chr{$chr}->{$start}->{nonmeth} = $nonmeth; } else{ warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; ++$number_processed; if ($split_by_chromosome){ ## writing output to 1 file per chromosome my $chromosome_out = $cytosine_out; if ($chromosome_out =~ /\.txt$/){ $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; } else{ $chromosome_out =~ s/$/.chr${last_chr}.txt/; } open (CYT,'>',$chromosome_out) or die $!; # warn "Writing output for $last_chr to $chromosome_out\n"; } my $tri_nt; my $tetra_nt; my $penta_nt; my $context; $processed{$last_chr} = 1; while ( $chromosomes{$last_chr} =~ /([CG])/g){ $tri_nt = ''; $context = ''; if ($tetra){ $tetra_nt = ''; # clearing $penta_nt = ''; } my $pos = pos$chromosomes{$last_chr}; my $strand; my $meth = 0; my $nonmeth = 0; if ($1 eq 'C'){ # C on forward strand $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! if ($tetra){ if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); } else{ $tetra_nt = ''; } if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); } else{ $penta_nt = ''; } } $strand = '+'; } elsif ($1 eq 'G'){ # C on reverse strand $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! $tri_nt = reverse $tri_nt; $tri_nt =~ tr/ACTG/TGAC/; if ($tetra){ if ( $pos - 4 >= 0 ){ $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); $tetra_nt = reverse $tetra_nt; $tetra_nt =~ tr/ACTG/TGAC/; } else{ $tetra_nt = ''; } if ( $pos - 5 >= 0 ){ $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); $penta_nt = reverse $penta_nt; $penta_nt =~ tr/ACTG/TGAC/; } else{ $penta_nt = ''; } } $strand = '-'; } next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted # if (length$penta_nt < 5){ # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); # } if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1) $meth = $chr{$last_chr}->{$pos}->{meth}; $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; } ### determining cytosine context if ($tri_nt =~ /^CG/){ $context = 'CG'; } elsif ($tri_nt =~ /^C.{1}G$/){ $context = 'CHG'; } elsif ($tri_nt =~ /^C.{2}$/){ $context = 'CHH'; } else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; next; } if ($CpG_only){ if ($tri_nt =~ /^CG/){ # CpG context is the default if ($zero){ # zero based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } else{ # default if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } else{ ## all cytosines, specified with --CX if ($zero){ # zero based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n" } } else{ # default if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } %chr = (); # resetting the hash # new first entry $last_chr = $chr; $chr{$chr}->{$start}->{meth} = $meth; $chr{$chr}->{$start}->{nonmeth} = $nonmeth; } } # Last found chromosome # If there never was a last chromosome then something must have gone wrong with reading the data in unless (defined $last_chr){ die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n"; } warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; $processed{$last_chr} = 1; if ($split_by_chromosome){ ## writing output to 1 file per chromosome my $chromosome_out = $cytosine_out; if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; } else{ # user specified output file name $chromosome_out =~ s/$/.chr${last_chr}.txt/; } open (CYT,'>',$chromosome_out) or die $!; # warn "Writing output for $last_chr to $chromosome_out\n"; } my $tri_nt; my $tetra_nt; my $penta_nt; my $context; while ( $chromosomes{$last_chr} =~ /([CG])/g){ $tri_nt = ''; $context = ''; if ($tetra){ $tetra_nt = ''; $penta_nt = ''; } my $pos = pos$chromosomes{$last_chr}; my $strand; my $meth = 0; my $nonmeth = 0; if ($1 eq 'C'){ # C on forward strand $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! $strand = '+'; if ($tetra){ if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); } else{ $tetra_nt = ''; } if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); } else{ $penta_nt = ''; } } } elsif ($1 eq 'G'){ # C on reverse strand $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! $tri_nt = reverse $tri_nt; $tri_nt =~ tr/ACTG/TGAC/; $strand = '-'; if ($tetra){ if ( $pos - 4 >= 0 ){ $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); $tetra_nt = reverse $tetra_nt; $tetra_nt =~ tr/ACTG/TGAC/; } else{ $tetra_nt = ''; } if ( $pos - 5 >= 0 ){ $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); $penta_nt = reverse $penta_nt; $penta_nt =~ tr/ACTG/TGAC/; } else{ $penta_nt = ''; } } } if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1 $meth = $chr{$last_chr}->{$pos}->{meth}; $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; } next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted # if (length$penta_nt < 5){ # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); # } ### determining cytosine context if ($tri_nt =~ /^CG/){ $context = 'CG'; } elsif ($tri_nt =~ /^C.{1}G$/){ $context = 'CHG'; } elsif ($tri_nt =~ /^C.{2}$/){ $context = 'CHH'; } else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; next; } if ($CpG_only){ if ($tri_nt =~ /^CG/){ # CpG context is the default if ($zero){ # zero-based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } else{ # default if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } else{ ## all cytosines, specified with --CX if ($zero){ # zero based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } else{ # default if ($tetra){ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; ### Now processing chromosomes that were not covered in the coverage file warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n"; my $unprocessed = 0; foreach my $chr (sort keys %processed) { unless ( $processed{$chr} ) { ++$unprocessed; ++$number_processed; process_unprocessed_chromosomes($chr); } } if ($unprocessed == 0) { warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n"; } else{ warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n"; } close CYT or warn $!; } #### GC CONTEXT - optional #### sub generate_GC_context_report { warn "="x82,"\n"; warn "Methylation information for GC context will now be written to a GpC-context report\n"; warn "="x82,"\n\n"; sleep (2); my $number_processed = 0; ### changing to the output directory again unless ($output_dir eq ''){ # default chdir $output_dir or die "Failed to change directory to $output_dir\n"; # warn "Changed directory to $output_dir\n"; } my $in = shift; if ($in =~ /gz$/){ open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; } else{ open (IN,"$in") or die "Failed to read from file $in: $!\n"; } my $gc_out = $cytosine_out.'.GpC_report.txt'; my $gc_cov = $cytosine_out.'.GpC.cov'; ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n"; warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n"; open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n"; warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n"; my $last_chr; my %chr; # storing reads for one chromosome at a time my $count = 0; while (<IN>){ chomp; ++$count; my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); # defining the first chromosome unless (defined $last_chr){ $last_chr = $chr; ++$number_processed; # warn "Storing all covered cytosine positions for chromosome: $chr\n"; } ### start positions are 1-based if ($chr eq $last_chr){ $chr{$chr}->{$start}->{meth} = $meth; $chr{$chr}->{$start}->{nonmeth} = $nonmeth; } else{ warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; ++$number_processed; $processed{$last_chr} = 1; while ( $chromosomes{$last_chr} =~ /(GC)/g){ # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time my $context_top = ''; my $context_bottom = ''; my $pos = pos$chromosomes{$last_chr}; my $meth_top = 0; my $meth_bottom = 0; my $nonmeth_top = 0; my $nonmeth_bottom = 0; #warn "$1\n"; sleep(1); # C on forward strand my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! my $strand_top = '+'; # C on reverse strand my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! $tri_nt_bottom = reverse $tri_nt_bottom; $tri_nt_bottom =~ tr/ACTG/TGAC/; my $strand_bottom = '-'; next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the reverse strand # top strand if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! $meth_top = $chr{$last_chr}->{$pos}->{meth}; $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; } # bottom strand if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; } ### determining cytosine context top strand if ($tri_nt_top =~ /^CG/){ $context_top = 'CG'; } elsif ($tri_nt_top =~ /^C.{1}G$/){ $context_top = 'CHG'; } elsif ($tri_nt_top =~ /^C.{2}$/){ $context_top = 'CHH'; } else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; next; } ### determining cytosine context bottom strand if ($tri_nt_bottom =~ /^CG/){ $context_bottom = 'CG'; } elsif ($tri_nt_bottom =~ /^C.{1}G$/){ $context_bottom = 'CHG'; } elsif ($tri_nt_bottom =~ /^C.{2}$/){ $context_bottom = 'CHH'; } else{ # if the context can't be determined the positions will not be printed warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; next; } # if Cs were not covered at all they are not written out unless ($meth_bottom + $nonmeth_bottom == 0){ my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; } unless ($meth_top + $nonmeth_top == 0){ my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; } } %chr = (); # resetting the hash # new first entry $last_chr = $chr; $chr{$chr}->{$start}->{meth} = $meth; $chr{$chr}->{$start}->{nonmeth} = $nonmeth; } } # Last found chromosome warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; $processed{$last_chr} = 1; while ( $chromosomes{$last_chr} =~ /(GC)/g){ # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time my $context_top = ''; my $context_bottom = ''; my $pos = pos$chromosomes{$last_chr}; my $meth_top = 0; my $meth_bottom = 0; my $nonmeth_top = 0; my $nonmeth_bottom = 0; # C on forward strand my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! my $strand_top = '+'; # C on reverse strand my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! $tri_nt_bottom = reverse $tri_nt_bottom; $tri_nt_bottom =~ tr/ACTG/TGAC/; my $strand_bottom = '-'; next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the bottom strand ### determining cytosine context top strand if ($tri_nt_top =~ /^CG/){ $context_top = 'CG'; } elsif ($tri_nt_top =~ /^C.{1}G$/){ $context_top = 'CHG'; } elsif ($tri_nt_top =~ /^C.{2}$/){ $context_top = 'CHH'; } else{ # if the context can't be determined the positions will not be printed warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; next; } ### determining cytosine context bottom strand if ($tri_nt_bottom =~ /^CG/){ $context_bottom = 'CG'; } elsif ($tri_nt_bottom =~ /^C.{1}G$/){ $context_bottom = 'CHG'; } elsif ($tri_nt_bottom =~ /^C.{2}$/){ $context_bottom = 'CHH'; } else{ # if the context can't be determined the positions will not be printed warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; next; } # top strand if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! $meth_top = $chr{$last_chr}->{$pos}->{meth}; $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; } # bottom strand if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; } # if Cs were not covered at all they are not written out unless ($meth_bottom + $nonmeth_bottom == 0){ my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; } unless ($meth_top + $nonmeth_top == 0){ my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; } } warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; close GC or warn $!; } #### #### sub process_unprocessed_chromosomes{ my $chr = shift; warn "Writing cytosine report for not covered chromosome $chr\n"; $processed{$chr} = 1; if ($split_by_chromosome){ ## writing output to 1 file per chromosome my $chromosome_out = $cytosine_out; if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt $chromosome_out =~ s/txt$/chr${chr}.txt/; } else{ # user specified output file name $chromosome_out =~ s/$/.chr${chr}.txt/; } open (CYT,'>',$chromosome_out) or die $!; # warn "Writing output for $last_chr to $chromosome_out\n"; } my $tri_nt; my $tetra_nt; my $penta_nt; my $context; while ( $chromosomes{$chr} =~ /([CG])/g){ $tri_nt = ''; $context = ''; if ($tetra){ $tetra_nt = ''; # clearing $penta_nt = ''; } my $pos = pos$chromosomes{$chr}; my $strand; my $meth = 0; my $nonmeth = 0; if ($1 eq 'C'){ # C on forward strand $tri_nt = substr ($chromosomes{$chr},($pos-1),3); # positions are 0-based! $strand = '+'; if ($tetra){ if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){ $tetra_nt = substr ($chromosomes{$chr},($pos-1),4); } else{ $tetra_nt = ''; } if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){ $penta_nt = substr ($chromosomes{$chr},($pos-1),5); } else{ $penta_nt = ''; } } } elsif ($1 eq 'G'){ # C on reverse strand $tri_nt = substr ($chromosomes{$chr},($pos-3),3); # positions are 0-based! $tri_nt = reverse $tri_nt; $tri_nt =~ tr/ACTG/TGAC/; $strand = '-'; if ($tetra){ if ( $pos - 4 >= 0 ){ $tetra_nt = substr ($chromosomes{$chr},($pos-4),4); $tetra_nt = reverse $tetra_nt; $tetra_nt =~ tr/ACTG/TGAC/; } else{ $tetra_nt = ''; } if ( $pos - 5 >= 0 ){ $penta_nt = substr ($chromosomes{$chr},($pos-5),5); $penta_nt = reverse $penta_nt; $penta_nt =~ tr/ACTG/TGAC/; } else{ $penta_nt = ''; } } $strand = '-'; } next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted ### determining cytosine context if ($tri_nt =~ /^CG/){ $context = 'CG'; } elsif ($tri_nt =~ /^C.{1}G$/){ $context = 'CHG'; } elsif ($tri_nt =~ /^C.{2}$/){ $context = 'CHH'; } else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; next; } if ($CpG_only){ if ($tri_nt =~ /^CG/){ # CpG context is the default if ($zero){ # zero-based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } else{ # default if ($tetra){ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } else{ ## all cytosines, specified with --CX if ($zero){ # zero based coordinates $pos -= 1; if ($tetra){ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } else{ # default if ($tetra){ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; } else{ print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; } } } } # close CYT or warn $!; } sub read_genome_into_memory{ ## reading in and storing the specified genome in the %chromosomes hash chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; my @chromosome_filenames = <*.fa>; ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta unless (@chromosome_filenames){ @chromosome_filenames = <*.fasta>; } unless (@chromosome_filenames){ die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; } foreach my $chromosome_filename (@chromosome_filenames){ # skipping the tophat entire mouse genome fasta file next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; ### first line needs to be a fastA header my $first_line = <CHR_IN>; chomp $first_line; $first_line =~ s/\r//; # removing /r carriage returns ### Extracting chromosome name from the FastA header my $chromosome_name = extract_chromosome_name($first_line); my $sequence; while (<CHR_IN>){ chomp; $_ =~ s/\r//; # removing /r carriage returns if ($_ =~ /^>/){ ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) if (exists $chromosomes{$chromosome_name}){ warn "chr $chromosome_name (",length $sequence ," bp)\n"; die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; } else { if (length($sequence) == 0){ warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; } warn "chr $chromosome_name (",length $sequence ," bp)\n"; $chromosomes{$chromosome_name} = $sequence; $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed } ### resetting the sequence variable $sequence = ''; ### setting new chromosome name $chromosome_name = extract_chromosome_name($_); } else{ $sequence .= uc$_; } } if (exists $chromosomes{$chromosome_name}){ warn "chr $chromosome_name (",length $sequence ," bp)\t"; die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; } else{ if (length($sequence) == 0){ warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; } warn "chr $chromosome_name (",length $sequence ," bp)\n"; $chromosomes{$chromosome_name} = $sequence; $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed } } warn "\n"; chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; } sub extract_chromosome_name { ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well my $fasta_header = shift; if ($fasta_header =~ s/^>//){ my ($chromosome_name) = split (/\s+/,$fasta_header); return $chromosome_name; } else{ die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; } } sub print_helpfile{ warn <<EOF SYNOPSIS: This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only (for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'. The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be sorted by position and exactly in the following format: <chromosome> <start position> <end position> <methylation percentage> <count methylated> <count unmethylated> The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!). USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input] -o/--output <filename> Name of the output file, mandatory. --dir Output directory. Output is written to the current directory if not specified explicitly. --genome_folder <path> Enter the genome folder you wish to use to extract sequences from (full path only). Accepted formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. -CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of its context. This applies to both forward and reverse strands. Please be aware that this will generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. Default: OFF (i.e. Default = CpG context only). --merge_CpG Using this option will post-process the genome-wide report to write out an additional coverage file (see above for the coverage file format) which has the top and bottom strand methylation evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be: genome-wide CpG report (old) gi|9626372|ref|NC_001422.1| 157 + 313 156 CG gi|9626372|ref|NC_001422.1| 158 - 335 156 CG merged CpG evidence coverage file (new) gi|9626372|ref|NC_001422.1| 157 158 67.500000 648 312 This option is currently experimental, and only works if CpG context only and a single genome-wide report were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome). --gc/--gc_context In addition to normal processing this option will reprocess the genome to find methylation in GpC context. This might be useful for specialist applications where GpC methylases had been deployed. The output format is exactly the same as for the normal cytosine report, and only positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition this will write out a Bismark coverage file (ending in GpC.cov). --ff In addition to the standard output selecting --ff will also extract a four and five (tetra/penta) nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the chromosome) will be left blank; sequences containing Ns are ignored. --zero_based Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF. --split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files will be named to include the input filename and the chromosome number. --gzip Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output. --help Displays this help message and exits OUTPUT FORMAT: The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords): =========================================================================================================== <chromosome> <position> <strand> <count methylated> <count non-methylated> <C-context> <trinucleotide context> Script last modified: 04 April 2016 EOF ; exit 1; }