Mercurial > repos > bgruening > bismark

#!/usr/bin/env perl
use warnings;
use strict;
$|++;
use Getopt::Long;
use Cwd;
use Carp;

## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.

my %chromosomes; # storing sequence information of all chromosomes/scaffolds
my %processed;   # keeping a record of which chromosomes have been processed
my $coverage2cytosine_version = 'v0.16.3';

my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline();

warn "Summary of parameters for genome-wide cytosine report:\n";
warn '='x78,"\n";
warn "Coverage infile:\t\t\t$coverage_infile\n";
warn "Output directory:\t\t\t>$output_dir<\n";
warn "Parent directory:\t\t\t>$parent_dir<\n";
warn "Genome directory:\t\t\t>$genome_folder<\n";

if ($CX_context){
  warn "CX context:\t\t\t\tyes\n";
}
else{
  warn "CX context:\t\t\t\tno (CpG context only, default)\n";
}
if ($merge_CpGs){
  warn "Pooling CpG top/bottom strand evidence:\tyes\n";
}
if($gc_context){
  warn "Optional GC context track:\t\tyes\n";
}
if ($tetra){
    warn "Tetra/Penta nucleotide context:\t\tyes\n";
}

if ($zero){
  warn "Genome coordinates used:\t\t0-based (user specified)\n";
}
else{
  warn "Genome coordinates used:\t\t1-based (default)\n";
}

if ($gzip){
    warn "GZIP compression:\t\t\tyes\n";
}
else{
    warn "GZIP compression:\t\t\tno\n";
}

if ($split_by_chromosome){
  warn "Split by chromosome:\t\t\tyes\n\n\n";
}
else{
  warn "Split by chromosome:\t\t\tno\n\n\n";
}
sleep (3);

read_genome_into_memory();
warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n";

generate_genome_wide_cytosine_report($coverage_infile);

### 11 December 2014

# The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand
# into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package
# which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing

if ($merge_CpGs){
  # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being
  combine_CpGs_to_single_CG_entity($cytosine_out);
}

### 18 August 2015

# The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed
if ($gc_context){
    generate_GC_context_report($coverage_infile);
}


sub combine_CpGs_to_single_CG_entity{
  my $CpG_report_file = shift;
  warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n";

  open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n";
  my $pooled_CG = $CpG_report_file;
  $pooled_CG =~ s/$/.merged_CpG_evidence.cov/;
  open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n";
  warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n";
  sleep(1);

  while (1){
    my $line1 = <IN>;
    my $line2 = <IN>;
    last unless ($line1 and $line2);

    chomp $line1;
    chomp $line2;

    my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
    my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
    # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
    # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1);

    if ($pos1 < 2){
      $line1 = $line2;
      $line2 = <IN>;
      chomp $line2;
      ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1);
      ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2);
    }

    # a few sanity checks
    die "The context of the first line was not CG:\t$line1"  unless ($context1 eq 'CG');
    die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG');

    unless ($strand1 eq '+' and $strand2 eq'-'){
      die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n";
    }
    unless ($pos2 eq ($pos1 + 1)){
      die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n";
    }
    unless($chr1 eq $chr2){
      die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n";
    }

    # This looks alright from what I can tell, lets pool the 2 strands

    my $pooled_m = $m1 + $m2;
    my $pooled_u = $u1 + $u2;

    # since this is a new coverage file we only write it out if the coverage is at least 1
    next unless ($pooled_m + $pooled_u) > 0;

    my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100);
    # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n";
    # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n";
    # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
    # sleep(1);
    print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n";
  }

  warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n";

}


sub process_commandline{
  my $help;
  my $output_dir;
  my $genome_folder;
  my $zero;
  my $CpG_only;
  my $CX_context;
  my $gzip;

  my $split_by_chromosome;
  my $cytosine_out;
  my $parent_dir;
  my $version;
  my $merge_CpGs;
  my $gc_context;
  my $tetra;

  my $command_line = GetOptions ('help|man' => \$help,
				 'dir=s' => \$output_dir,
				 'g|genome_folder=s' => \$genome_folder,
				 "zero_based" => \$zero,
				 "CX|CX_context" => \$CX_context,
				 "split_by_chromosome" => \$split_by_chromosome,
				 'o|output=s' => \$cytosine_out,
				 'parent_dir=s' => \$parent_dir,
				 'version' => \$version,
				 'merge_CpGs' => \$merge_CpGs,
				 'GC|GC_context' => \$gc_context,
				 'ff|qq' => \$tetra,
				 'gzip' => \$gzip,
      );

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($command_line){
      die "Please respecify command line options\n";
  }

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }

  if ($version){
    print << "VERSION";


                      Bismark Methylation Extractor Module -
                               coverage2cytosine

                      Bismark Extractor Version: $coverage2cytosine_version
              Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
                www.bioinformatics.babraham.ac.uk/projects/bismark/


VERSION
    exit;
  }

  ### no files provided
  unless (@ARGV){
    warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n";
    sleep(2);

    print_helpfile();
    exit;
  }

  my $coverage_infile = shift @ARGV;

  unless ($parent_dir){
    $parent_dir = getcwd();
  }
  unless ($parent_dir =~ /\/$/){
    $parent_dir =~ s/$/\//;
  }

  unless (defined $cytosine_out){
    die "Please provide the name of the output file using the option -o/--output filename\n";
  }

  ### OUTPUT DIR PATH
  if (defined $output_dir){
    unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
      unless ($output_dir =~ /\/$/){
	$output_dir =~ s/$/\//;
      }
    }
  }
  else{
    $output_dir = '';
  }

  unless ($CX_context){
    $CX_context = 0;
    $CpG_only = 1;
  }

  ### GENOME folder
  if ($genome_folder){
    unless ($genome_folder =~/\/$/){
      $genome_folder =~ s/$/\//;
    }
  }
  else{
    die "Please specify a genome folder to proceed (full path only)\n";
  }

  if ($merge_CpGs){
    if ($CX_context){
      die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n";
    }
    if ($split_by_chromosome){
      die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n";
    }
  }

  return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra);
}


sub generate_genome_wide_cytosine_report {

  warn  "="x78,"\n";
  warn "Methylation information will now be written into a genome-wide cytosine report\n";
  warn  "="x78,"\n\n";
  sleep (2);

  my $number_processed = 0;

  ### changing to the output directory again
  unless ($output_dir eq ''){ # default
    chdir $output_dir or die "Failed to change directory to $output_dir\n";
    # warn "Changed directory to $output_dir\n";
  }

  my $in = shift;
  # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir
  if ($in =~ /gz$/){
      open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16
  }
  else{
      open (IN,"$in") or die "Failed to read from file $in: $!\n";
  }


  ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
  unless ($split_by_chromosome){ ### writing all output to a single file (default)
      if ($gzip){
	  unless ($cytosine_out =~ /\.gz$/){
	      $cytosine_out .= '.gz';
	  }
	  open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n";
      }
      else{
	  open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n";
      }
      warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n";
      sleep (1);
  }

  my $last_chr;
  my %chr; # storing reads for one chromosome at a time

  my $count = 0;
  while (<IN>){
      chomp;
      ++$count;
      my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);

      # defining the first chromosome
      unless (defined $last_chr){
	  $last_chr = $chr;
	  ++$number_processed;
	  # warn "Storing all covered cytosine positions for chromosome: $chr\n";
      }

      ### As of version 0.9.1 the start positions are 1-based!
      if ($chr eq $last_chr){
	  $chr{$chr}->{$start}->{meth} = $meth;
	  $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
      }
      else{
	  warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
	  ++$number_processed;

	  if ($split_by_chromosome){ ## writing output to 1 file per chromosome
	      my $chromosome_out = $cytosine_out;
	      if ($chromosome_out =~ /\.txt$/){
		  $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
	      }
	      else{
		  $chromosome_out =~ s/$/.chr${last_chr}.txt/;
	      }
	      open (CYT,'>',$chromosome_out) or die $!;
	      # warn "Writing output for $last_chr to $chromosome_out\n";
	  }

	  my $tri_nt;
	  my $tetra_nt;
	  my $penta_nt;
	  my $context;

	  $processed{$last_chr} = 1;
	  while ( $chromosomes{$last_chr} =~ /([CG])/g){

	      $tri_nt = '';
	      $context = '';

	      if ($tetra){
		  $tetra_nt = ''; # clearing
		  $penta_nt = '';
	      }

	      my $pos = pos$chromosomes{$last_chr};

	      my $strand;
	      my $meth = 0;
	      my $nonmeth = 0;

	      if ($1 eq 'C'){    # C on forward strand
		  $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
		  if ($tetra){
		      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
			  $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
		      }
		      else{
			  $tetra_nt = '';
		      }
		      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
			  $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
		      }
		      else{
			  $penta_nt = '';
		      }
		  }
		  $strand = '+';
	      }
	      elsif ($1 eq 'G'){ # C on reverse strand

		  $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3);   # positions are 0-based!
		  $tri_nt = reverse $tri_nt;
		  $tri_nt =~ tr/ACTG/TGAC/;

		  if ($tetra){
		      if ( $pos - 4 >= 0 ){
			  $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
			  $tetra_nt = reverse $tetra_nt;
			  $tetra_nt =~ tr/ACTG/TGAC/;
		      }
		      else{
			  $tetra_nt = '';
		      }
		      if ( $pos - 5 >= 0 ){
			  $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
			  $penta_nt = reverse $penta_nt;
			  $penta_nt =~ tr/ACTG/TGAC/;
		      }
		      else{
			  $penta_nt = '';
		      }
		  }
		  $strand = '-';
	      }

	      next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted

	      # if (length$penta_nt < 5){
	      # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
	      # }

	      if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1)
		  $meth =  $chr{$last_chr}->{$pos}->{meth};
		  $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
	      }


	      ### determining cytosine context
	      if ($tri_nt =~ /^CG/){
		  $context = 'CG';
	      }
	      elsif ($tri_nt =~ /^C.{1}G$/){
		  $context = 'CHG';
	      }
	      elsif ($tri_nt =~ /^C.{2}$/){
		  $context = 'CHH';
	      }
	      else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
		  warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n";
		  next;
	      }

	      if ($CpG_only){
		  if ($tri_nt =~ /^CG/){ # CpG context is the default
		      if ($zero){ # zero based coordinates
			  $pos -= 1;
			  if ($tetra){
			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
			  }
			  else{
			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
			  }
		      }
		      else{ # default
			  if ($tetra){
			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
			  }
			  else{
			      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
			  }
		      }
		  }
	      }
	      else{ ## all cytosines, specified with --CX
		  if ($zero){ # zero based coordinates
		      $pos -= 1;
		      if ($tetra){
			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		      }
		      else{
			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"
		      }
		  }
		  else{ # default
		      if ($tetra){
			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		      }
		      else{
			  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
		      }
		  }
	      }
	  }

	  %chr = (); # resetting the hash

	  # new first entry
	  $last_chr = $chr;
	  $chr{$chr}->{$start}->{meth} = $meth;
	  $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
      }
  }

  # Last found chromosome
  # If there never was a last chromosome then something must have gone wrong with reading the data in
  unless (defined $last_chr){
      die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n";
  }

  warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
  $processed{$last_chr} = 1;

  if ($split_by_chromosome){ ## writing output to 1 file per chromosome
      my $chromosome_out = $cytosine_out;
      if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt
	  $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/;
      }
      else{ # user specified output file name
	  $chromosome_out =~ s/$/.chr${last_chr}.txt/;
      }
      open (CYT,'>',$chromosome_out) or die $!;
      # warn "Writing output for $last_chr to $chromosome_out\n";
  }

  my $tri_nt;
  my $tetra_nt;
  my $penta_nt;
  my $context;

  while ( $chromosomes{$last_chr} =~ /([CG])/g){

      $tri_nt = '';
      $context = '';

      if ($tetra){
	  $tetra_nt = '';
	  $penta_nt = '';
      }

      my $pos = pos$chromosomes{$last_chr};

      my $strand;
      my $meth = 0;
      my $nonmeth = 0;

      if ($1 eq 'C'){    # C on forward strand
	  $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
	  $strand = '+';

	  if ($tetra){
	      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){
		  $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4);
	      }
	      else{
		  $tetra_nt = '';
	      }
	      if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){
		  $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5);
	      }
	      else{
		  $penta_nt = '';
	      }
	  }

      }
      elsif ($1 eq 'G'){ # C on reverse strand
	  $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3);   # positions are 0-based!
	  $tri_nt = reverse $tri_nt;
	  $tri_nt =~ tr/ACTG/TGAC/;
	  $strand = '-';

	  if ($tetra){
	      if ( $pos - 4 >= 0 ){
		  $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4);
		  $tetra_nt = reverse $tetra_nt;
		  $tetra_nt =~ tr/ACTG/TGAC/;
	      }
	      else{
		  $tetra_nt = '';
	      }

	      if ( $pos - 5 >= 0 ){
		  $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5);
		  $penta_nt = reverse $penta_nt;
		  $penta_nt =~ tr/ACTG/TGAC/;
	      }
	      else{
		  $penta_nt = '';
	      }
	  }
      }

      if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1
	  $meth =  $chr{$last_chr}->{$pos}->{meth};
	  $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth};
      }

      next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted
      # if (length$penta_nt < 5){
      # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1);
      # }

      ### determining cytosine context
      if ($tri_nt =~ /^CG/){
	  $context = 'CG';
      }
      elsif ($tri_nt =~ /^C.{1}G$/){
	  $context = 'CHG';
      }
      elsif ($tri_nt =~ /^C.{2}$/){
	  $context = 'CHH';
      }
      else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
	  warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
	  next;
      }

      if ($CpG_only){
	  if ($tri_nt =~ /^CG/){ # CpG context is the default
	      if ($zero){ # zero-based coordinates
		  $pos -= 1;
		  if ($tetra){
		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		  }
		  else{
		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
		  }
	      }
	      else{ # default
		  if ($tetra){
		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		  }
		  else{
		      print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
		  }
	      }
	  }
      }
      else{ ## all cytosines, specified with --CX
	  if ($zero){ # zero based coordinates
	      $pos -= 1;
	      if ($tetra){
		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
	      }
	      else{
		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
	      }
	  }
	  else{ # default
	      if ($tetra){
		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
	      }
	      else{
		  print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
	      }
	  }
      }
  }

  warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";

  ### Now processing chromosomes that were not covered in the coverage file
  warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n";
  my $unprocessed = 0;

  foreach my $chr (sort keys %processed) {
      unless ( $processed{$chr} ) {
	  ++$unprocessed;
	  ++$number_processed;
	  process_unprocessed_chromosomes($chr);
      }
  }

  if ($unprocessed == 0) {
    warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n";
  }
  else{
    warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n";
  }

  close CYT or warn $!;

}


#### GC CONTEXT - optional
####

sub generate_GC_context_report {

  warn  "="x82,"\n";
  warn "Methylation information for GC context will now be written to a GpC-context report\n";
  warn  "="x82,"\n\n";
  sleep (2);

  my $number_processed = 0;

  ### changing to the output directory again
  unless ($output_dir eq ''){ # default
    chdir $output_dir or die "Failed to change directory to $output_dir\n";
    # warn "Changed directory to $output_dir\n";
  }

  my $in = shift;
  if ($in =~ /gz$/){
    open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n";
  }
  else{
    open (IN,"$in") or die "Failed to read from file $in: $!\n";
  }

  my $gc_out = $cytosine_out.'.GpC_report.txt';
  my $gc_cov = $cytosine_out.'.GpC.cov';

  ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands
  open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n";
  warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n";
  open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n";
  warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n";

  my $last_chr;
  my %chr; # storing reads for one chromosome at a time

  my $count = 0;
  while (<IN>){
    chomp;
    ++$count;
    my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/);

    # defining the first chromosome
    unless (defined $last_chr){
      $last_chr = $chr;
      ++$number_processed;
      # warn "Storing all covered cytosine positions for chromosome: $chr\n";
    }

    ### start positions are 1-based
    if ($chr eq $last_chr){
      $chr{$chr}->{$start}->{meth} = $meth;
      $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
    }
    else{
      warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
      ++$number_processed;
      $processed{$last_chr} = 1;
      while ( $chromosomes{$last_chr} =~ /(GC)/g){

	# a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
	my $context_top = '';
	my $context_bottom = '';
	my $pos = pos$chromosomes{$last_chr};

	my $meth_top = 0;
	my $meth_bottom = 0;
	my $nonmeth_top = 0;
	my $nonmeth_bottom = 0;

	#warn "$1\n"; sleep(1);
	# C on forward strand
	my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
	my $strand_top = '+';

	# C on reverse strand
	my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3);   # positions are 0-based!
	$tri_nt_bottom = reverse $tri_nt_bottom;
	$tri_nt_bottom =~ tr/ACTG/TGAC/;
	my $strand_bottom = '-';

	next if (length $tri_nt_top < 3);     # trinucleotide sequence could not be extracted for the top strand
 	next if (length $tri_nt_bottom < 3);  # trinucleotide sequence could not be extracted for the reverse strand

	# top strand
	if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
	  $meth_top    = $chr{$last_chr}->{$pos}->{meth};
	  $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
	}
	# bottom strand
	if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
	  $meth_bottom    = $chr{$last_chr}->{$pos-1}->{meth};
	  $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
	}

	### determining cytosine context top strand
	if ($tri_nt_top =~ /^CG/){
	  $context_top = 'CG';
	}
	elsif ($tri_nt_top =~ /^C.{1}G$/){
	  $context_top = 'CHG';
	}
	elsif ($tri_nt_top =~ /^C.{2}$/){
	  $context_top = 'CHH';
	}
	else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
	  warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
	  next;
	}

	### determining cytosine context bottom strand
	if ($tri_nt_bottom =~ /^CG/){
	  $context_bottom = 'CG';
	}
	elsif ($tri_nt_bottom =~ /^C.{1}G$/){
	  $context_bottom = 'CHG';
	}
	elsif ($tri_nt_bottom =~ /^C.{2}$/){
	  $context_bottom = 'CHH';
	}
	else{ # if the context can't be determined the positions will not be printed
	  warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
	  next;
	}

	# if Cs were not covered at all they are not written out
	unless ($meth_bottom + $nonmeth_bottom == 0){
	  my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
	  print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
	  print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
	}
	unless ($meth_top + $nonmeth_top == 0){
	  my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
	  print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
	  print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
	}

      }

      %chr = (); # resetting the hash

      # new first entry
      $last_chr = $chr;
      $chr{$chr}->{$start}->{meth} = $meth;
      $chr{$chr}->{$start}->{nonmeth} = $nonmeth;
    }
  }

  # Last found chromosome
  warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n";
  $processed{$last_chr} = 1;
  while ( $chromosomes{$last_chr} =~ /(GC)/g){

    # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time
    my $context_top = '';
    my $context_bottom = '';
    my $pos = pos$chromosomes{$last_chr};

    my $meth_top = 0;
    my $meth_bottom = 0;
    my $nonmeth_top = 0;
    my $nonmeth_bottom = 0;

    # C on forward strand
    my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3);   # positions are 0-based!
    my $strand_top = '+';

    # C on reverse strand
    my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3);   # positions are 0-based!
    $tri_nt_bottom = reverse $tri_nt_bottom;
    $tri_nt_bottom =~ tr/ACTG/TGAC/;
    my $strand_bottom = '-';

    next if (length $tri_nt_top < 3);     # trinucleotide sequence could not be extracted for the top strand
    next if (length $tri_nt_bottom < 3);  # trinucleotide sequence could not be extracted for the bottom strand

    ### determining cytosine context top strand
    if ($tri_nt_top =~ /^CG/){
      $context_top = 'CG';
    }
    elsif ($tri_nt_top =~ /^C.{1}G$/){
      $context_top = 'CHG';
    }
    elsif ($tri_nt_top =~ /^C.{2}$/){
      $context_top = 'CHH';
    }
    else{ # if the context can't be determined the positions will not be printed
      warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n";
      next;
    }

    ### determining cytosine context bottom strand
    if ($tri_nt_bottom =~ /^CG/){
      $context_bottom = 'CG';
    }
    elsif ($tri_nt_bottom =~ /^C.{1}G$/){
      $context_bottom = 'CHG';
    }
    elsif ($tri_nt_bottom =~ /^C.{2}$/){
      $context_bottom = 'CHH';
    }
    else{ # if the context can't be determined the positions will not be printed
      warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n";
      next;
    }

    # top strand
    if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based!
      $meth_top    = $chr{$last_chr}->{$pos}->{meth};
      $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth};
    }
    # bottom strand
    if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs
      $meth_bottom    = $chr{$last_chr}->{$pos-1}->{meth};
      $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth};
    }

    # if Cs were not covered at all they are not written out
    unless ($meth_bottom + $nonmeth_bottom == 0){
      my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100);
      print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n";
      print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n";
    }
    unless ($meth_top + $nonmeth_top == 0){
      my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100);
      print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n";
      print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n";
    }
  }

  warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n";

  close GC or warn $!;

}

####
####


sub process_unprocessed_chromosomes{

  my $chr = shift;

  warn "Writing cytosine report for not covered chromosome $chr\n";
  $processed{$chr} = 1;

  if ($split_by_chromosome){ ## writing output to 1 file per chromosome
    my $chromosome_out = $cytosine_out;
    if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt
	$chromosome_out =~ s/txt$/chr${chr}.txt/;
    }
    else{ # user specified output file name
	$chromosome_out =~ s/$/.chr${chr}.txt/;
    }

    open (CYT,'>',$chromosome_out) or die $!;
    # warn "Writing output for $last_chr to $chromosome_out\n";
  }

  my $tri_nt;
  my $tetra_nt;
  my $penta_nt;
  my $context;

  while ( $chromosomes{$chr} =~ /([CG])/g){

      $tri_nt = '';
      $context = '';
      if ($tetra){
	  $tetra_nt = ''; # clearing
	  $penta_nt = '';
      }

      my $pos = pos$chromosomes{$chr};

      my $strand;
      my $meth = 0;
      my $nonmeth = 0;

      if ($1 eq 'C'){    # C on forward strand
	  $tri_nt = substr ($chromosomes{$chr},($pos-1),3);   # positions are 0-based!
	  $strand = '+';
	  if ($tetra){
	      if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){
		  $tetra_nt = substr ($chromosomes{$chr},($pos-1),4);
	      }
	      else{
		  $tetra_nt = '';
	      }
	      if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){
		  $penta_nt = substr ($chromosomes{$chr},($pos-1),5);
	      }
	      else{
		  $penta_nt = '';
	      }
	  }
      }
      elsif ($1 eq 'G'){ # C on reverse strand
	  $tri_nt = substr ($chromosomes{$chr},($pos-3),3);   # positions are 0-based!
	  $tri_nt = reverse $tri_nt;
	  $tri_nt =~ tr/ACTG/TGAC/;
	  $strand = '-';

	  if ($tetra){
	      if ( $pos - 4 >= 0 ){
		  $tetra_nt = substr ($chromosomes{$chr},($pos-4),4);
		  $tetra_nt = reverse $tetra_nt;
		  $tetra_nt =~ tr/ACTG/TGAC/;
	      }
	      else{
		  $tetra_nt = '';
	      }
	      if ( $pos - 5 >= 0 ){
		  $penta_nt = substr ($chromosomes{$chr},($pos-5),5);
		  $penta_nt = reverse $penta_nt;
		  $penta_nt =~ tr/ACTG/TGAC/;
	      }
	      else{
		  $penta_nt = '';
	      }
	  }
	  $strand = '-';
      }

      next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted

      ### determining cytosine context
      if ($tri_nt =~ /^CG/){
	  $context = 'CG';
      }
      elsif ($tri_nt =~ /^C.{1}G$/){
	  $context = 'CHG';
      }
      elsif ($tri_nt =~ /^C.{2}$/){
	  $context = 'CHH';
      }
      else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark)
	  warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n";
	  next;
      }

      if ($CpG_only){
	  if ($tri_nt =~ /^CG/){ # CpG context is the default
	      if ($zero){ # zero-based coordinates
		  $pos -= 1;
		  if ($tetra){
		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		  }
		  else{
		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
		  }
	      }
	      else{ # default
		  if ($tetra){
		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
		  }
		  else{
		      print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
		  }
	      }
	  }
      }
      else{ ## all cytosines, specified with --CX
	  if ($zero){ # zero based coordinates
	      $pos -= 1;
	      if ($tetra){
		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
	      }
	      else{
		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
	      }
	  }
	  else{ # default
	      if ($tetra){
		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n";
	      }
	      else{
		  print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n";
	      }
	  }
      }
  }

  #  close CYT or warn $!;

}


sub read_genome_into_memory{

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames =  <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames =  <*.fasta>;
  }
  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    # skipping the tophat entire mouse genome fasta file
    next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa');

    open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";
    ### first line needs to be a fastA header
    my $first_line = <CHR_IN>;
    chomp $first_line;
    $first_line =~ s/\r//; # removing /r carriage returns

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    my $sequence;
    while (<CHR_IN>){
      chomp;
      $_ =~ s/\r//; # removing /r carriage returns

      if ($_ =~ /^>/){
	### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
	if (exists $chromosomes{$chromosome_name}){
	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
	  die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
	}
	else {
	  if (length($sequence) == 0){
	    warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
	  }
	  warn "chr $chromosome_name (",length $sequence ," bp)\n";
	  $chromosomes{$chromosome_name} = $sequence;
	  $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
	}
	### resetting the sequence variable
	$sequence = '';
	### setting new chromosome name
	$chromosome_name = extract_chromosome_name($_);
      }
      else{
	$sequence .= uc$_;
      }
    }

    if (exists $chromosomes{$chromosome_name}){
      warn "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
	warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      warn "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
      $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed
    }
  }
  warn "\n";
  chdir $parent_dir or die "Failed to move to directory $parent_dir\n";
}

sub extract_chromosome_name {
  ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well
  my $fasta_header = shift;
  if ($fasta_header =~ s/^>//){
    my ($chromosome_name) = split (/\s+/,$fasta_header);
    return $chromosome_name;
  }
  else{
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }
}


sub print_helpfile{

  warn <<EOF

  SYNOPSIS:

  This script generates a cytosine methylation report for a genome of interest and a sorted methylation input file produced
  by the script "bismark2bedGraph". By default, the output uses 1-based chromosome coordinates and reports CpG positions only
  (for both strands individually and not merged in any way). Coordinates may be changed to 0-based using the option '--zero_based'.

  The input file needs to have been generated with the script bismark2bedGraph (the file is called *.cov) or otherwise be
  sorted by position and exactly in the following format:

  <chromosome>  <start position>  <end position>  <methylation percentage>  <count methylated>  <count unmethylated>

  The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!).


  USAGE: coverage2cytosine [options] --genome_folder <path> -o <output> [input]


-o/--output <filename>   Name of the output file, mandatory.

--dir                    Output directory. Output is written to the current directory if not specified explicitly.

--genome_folder <path>   Enter the genome folder you wish to use to extract sequences from (full path only). Accepted
                         formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory.

-CX/--CX_context         The output file contains information on every single cytosine in the genome irrespective of
                         its context. This applies to both forward and reverse strands. Please be aware that this will
                         generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse.
                         Default: OFF (i.e. Default = CpG context only).

--merge_CpG              Using this option will post-process the genome-wide report to write out an additional coverage
                         file (see above for the coverage file format) which has the top and bottom strand methylation
                         evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format
                         for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be:

			   genome-wide CpG report (old)
			   gi|9626372|ref|NC_001422.1|     157     +       313     156     CG
			   gi|9626372|ref|NC_001422.1|     158     -       335     156     CG
			   merged CpG evidence coverage file (new)
			   gi|9626372|ref|NC_001422.1|     157     158     67.500000       648     312

			 This option is currently experimental, and only works if CpG context only and a single genome-wide report
                         were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome).

--gc/--gc_context        In addition to normal processing this option will reprocess the genome to find methylation in
                         GpC context. This might be useful for specialist applications where GpC methylases had been
                         deployed. The output format is exactly the same as for the normal cytosine report, and only
                         positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition
                         this will write out a Bismark coverage file (ending in GpC.cov).

--ff                     In addition to the standard output selecting --ff will also extract a four and five (tetra/penta)
                         nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the
                         chromosome) will be left blank; sequences containing Ns are ignored.

--zero_based             Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF.

--split_by_chromosome    Writes the output into individual files for each chromosome instead of a single output file. Files
                         will be named to include the input filename and the chromosome number.

--gzip                   Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output.

--help                   Displays this help message and exits


OUTPUT FORMAT:

The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords):
===========================================================================================================

<chromosome>  <position>  <strand>  <count methylated>  <count non-methylated>  <C-context>  <trinucleotide context>


                              Script last modified: 04 April 2016

EOF
    ;
  exit 1;
}
author	bgruening
date	Sat, 06 May 2017 13:18:09 -0400
parents
children