bismark: bismark_methyl_extractor/bismark2bedGraph comparison

comparison bismark_methyl_extractor/bismark2bedGraph @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'

author	bgruening
date	Sat, 06 May 2017 13:18:09 -0400
parents
children

comparison

equal deleted inserted replaced

-:0f8646f22b8d
+:fcadce4d9a06
+#!/usr/bin/env perl
+use warnings;
+use strict;
+$|++;
+use Getopt::Long;
+use Cwd;
+use Carp;
+## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk)
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+my $bismark2bedGraph_version = 'v0.16.3';
+my @bedfiles;
+my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total
+my @sorting_files;
+my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir) = process_commandline();
+warn "Using these input files: @sorting_files\n";
+warn "\nSummary of parameters for bismark2bedGraph conversion:\n";
+warn '='x54,"\n";
+warn "bedGraph output:\t\t$bedGraph_output\n";
+warn "output directory:\t\t>$output_dir<\n";
+if ($remove){
+warn "remove whitespaces:\t\tyes\n";
+}
+else{
+warn "remove whitespaces:\t\tno\n";
+}
+if ($CX_context){
+warn "CX context:\t\t\tyes\n";
+}
+else{
+warn "CX context:\t\t\tno (CpG context only, default)\n";
+}
+if ($no_header){
+warn "No-header selected:\t\tyes\n";
+}
+else{
+warn "No-header selected:\t\tno\n";
+}
+if ($ample_mem){
+warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n";
+}
+else{
+warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n";
+}
+unless($ample_mem){
+warn "Sort buffer size:\t\t$sort_size\n";
+}
+warn "Coverage threshold:\t\t$coverage_threshold\n";
+warn  "="x77,"\n";
+warn "Methylation information will now be written into a bedGraph and coverage file\n";
+warn  "="x77,"\n\n";
+sleep (2);
+### deciding which files to use for bedGraph conversion
+foreach my $filename (@sorting_files){
+### DETERMINING THE FULL PATH OF INPUT FILES
+if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again
+	# warn "folder name: $1\nfilename: $2\n\n";
+	chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path
+	$input_dir = getcwd();                                # this will always be the full path
+	$filename = $2;
+	# warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1);
+	chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back
+}
+else{
+	$input_dir = $parent_dir;
+}
+$input_dir .= '/';
+if ($CX_context){
+	# push @bedfiles,$output_dir.$filename;
+	push @bedfiles,$input_dir.$filename;
+}
+else{ ## CpG context only (default)
+	if ($filename =~ /^CpG/){ # only testing the actual filename without the path information
+	    push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename
+	}
+	else{
+	    # skipping CHH or CHG files
+	}
+}
+}
+if (@bedfiles){
+warn "Using the following files as Input:\n";
+print join ("\t",@bedfiles),"\n\n";
+sleep (2);
+}
+else{
+die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n";
+}
+open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n";
+warn "Writing bedGraph to file: $bedGraph_output\n";
+print OUT "track type=bedGraph\n";
+my $coverage_output = $bedGraph_output;
+unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){
+$coverage_output =~ s/$/.bismark.cov.gz/;
+}
+open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n";
+warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n";
+if ($zero){
+my $zero_coverage_output = $bedGraph_output;
+unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){
+$zero_coverage_output =~ s/$/.bismark.zero.cov/;
+}
+open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n";
+warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n";
+}
+warn "\n";
+my %temp_fhs;
+my @temp_files; # writing all context files (default CpG only) to these files prior to sorting
+my %chr_lengths; # storing chromosome lenghts in '--ample_memory' mode
+### changing to the output directory
+unless ($output_dir eq ''){ # default
+chdir $output_dir or die "Failed to change directory to $output_dir\n";
+warn "Changed directory to $output_dir\n";
+}
+if ($gazillion){
+if (scalar @bedfiles == 1){
+	warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n";
+}
+else{
+	warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n";
+	my $merge = "$bedGraph_output.methylation_calls.merged";
+	open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n";
+	warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2);
+	push @temp_files, $merge;
+}
+}
+foreach my $infile (@bedfiles) {
+# warn "Processing $infile\n";
+if ($remove) {
+	warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n";
+	if ($infile =~ /gz$/){
+	    open (READ,"gunzip -c $infile |") or die $!;
+	}
+	else{
+	    open (READ,$infile) or die $!;
+	}
+	my $removed_spaces_outfile = $infile;
+	$removed_spaces_outfile =~ s/$/.spaces_removed.txt/;
+	$removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename
+	warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n";
+	open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n";
+	unless ($no_header){
+	    $_ = <READ>;		### Bismark version header
+	    print REM $_;		### Bismark version header
+	}
+	while (<READ>) {
+	    chomp;
+	    my ($id,$strand,$chr,$pos,$context) = (split (/\t/));
+	    $id =~ s/\s+/_/g;
+	    print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n";
+	}
+	close READ or die $!;
+	close REM or die $!;
+	### changing the infile name to the new file without spaces
+	$infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly
+}
+# opening infile
+if ($infile =~ /gz$/){
+	open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n";
+}
+else{
+	open (IN,$infile) or die "Couldn't find file '$infile': $!\n";
+}
+if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again
+	$infile =~ s/.*\///;# replacing everything up to the last slash in the filename
+	# warn "Renamed Infile: $infile\n";
+}
+### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes conist of up to half a million scaffolds instead of ~23 chromosomes. This
+### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this
+### limit (some even say there is a reason for the OS doing so...).
+### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the
+### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster).
+if ($gazillion){
+	# using all infiles instead of sorting
+	if (scalar @bedfiles == 1){
+	    push @temp_files, $infile;
+	}
+	else{
+	    ## always ignoring the version header
+	    unless ($no_header){
+		$_ = <IN>;		### Bismark version header
+	    }
+	    while (<IN>) {
+		if ($_ =~ /^Bismark /){
+		    warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n";
+		    next;
+		}
+		print MERGE;
+	    }
+	    warn "Finished writing methylation calls from $infile to merged temp file\n";
+	}
+}
+else{
+	warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n";
+	## always ignoring the version header
+	unless ($no_header){
+	    $_ = <IN>;		### Bismark version header
+	}
+	while (<IN>) {
+	    if ($_ =~ /^Bismark /){
+		warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n";
+		next;
+	    }
+	    chomp;
+	    my ($chr,$pos) = (split (/\t/))[2,3];
+	    ### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step
+	    if ($ample_mem){
+		### setting the first position for this chromosome
+		unless (defined $chr_lengths{$chr} ){
+		    $chr_lengths{$chr} = $pos;
+		}
+		# for all subsequent postions for this chromosome
+		if ($pos > $chr_lengths{$chr} ){
+		    $chr_lengths{$chr} = $pos; # set the current position as the new highest position
+		}
+	    }
+	    # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n";
+	    $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names
+	    # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n";
+	    unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time
+		my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp';
+		# warn "Using temp file name: $temp_file_name\n"; sleep(1);
+		open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!";
+		push @temp_files, $temp_file_name; # storing temp files as we open them instead
+	    }
+	    print {$temp_fhs{$chr}} "$_\n";
+	}
+	warn "Finished writing out individual chromosome files for $infile\n";
+}
+}
+# closing temporary filehandles to force writing out buffered content
+foreach my $temp_fh(keys %temp_fhs){
+close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n";
+}
+### printing out the determined maximum position for each chromosome
+if ($ample_mem){
+foreach my $chr (sort keys %chr_lengths){
+	warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n";
+}
+warn "\n";
+}
+unless ($gazillion){
+warn "\n";
+warn "Collecting temporary chromosome file information... Processing the following input file(s):\n";
+warn join ("\n",@temp_files),"\n\n";
+sleep (1);
+}
+if ($gazillion){
+if (scalar @bedfiles > 1){
+	close (MERGE) or die "Failed to close filehandle MERGE: $!\n";
+}
+}
+foreach my $in (@temp_files) {
+if ($sort_size){
+	warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem);
+}
+my $ifh;
+my $name;
+my $meth_state;
+my $chr = "";
+my $pos = 0;
+my $meth_state2;
+my $last_pos;
+my $last_chr;
+### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command
+if ($ample_mem){
+# warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n";
+my @meth_count;
+my @unmeth_count;
+open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n";
+while (my $line = <$ifh>){
+next if ($line =~ /^Bismark/);
+chomp $line;
+($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
+unless ($last_pos and $last_chr){
+	$last_chr = $chr;
+	$last_pos = $pos;
+}
+unless (@meth_count and @unmeth_count){
+	warn  "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n";
+	@meth_count   = (0) x  $chr_lengths{$chr};
+	@unmeth_count = (0) x  $chr_lengths{$chr};
+	# warn "length of array meth count: ",scalar @meth_count,"\n";
+	warn "Finished generating arrays\n";
+	# sleep(1);
+}
+# warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1);
+# print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n";
+# sleep(1);
+# if ($last_chr ne $chr) {
+#    die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n";
+# }
+my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway
+unless($validated){
+	warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
+	next;
+}
+if ($meth_state eq '+'){
+	# warn "increasing meth $pos by 1\n"; sleep(1);
+	$meth_count[$pos-1]++;
+}
+else{
+	$unmeth_count[$pos-1]++;
+	# warn "increasing unmeth $pos by 1\n"; sleep(1);
+}
+}
+close $ifh or die $!;
+warn "Now printing methylation information for this chromosome\n";
+# warn "length of array meth count: ",scalar @meth_count,"\n";
+# warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n";
+foreach my $index (0..$#meth_count){
+my $totalcount = $meth_count[$index] + $unmeth_count[$index];
+if ($totalcount > 0){
+	#	warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n";
+	# sleep(1);
+	my $bed_pos = $index; ### bedGraph coordinates are 0 based
+	my $one_based_pos = $bed_pos + 1;
+	my $meth_percentage;
+	($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef);
+	if (defined $meth_percentage){
+	  # as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file
+	  # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?)
+	  print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
+	  # this is the coverage file. Coordinates are 1-based
+	  print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
+	  # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
+	  if ($zero){
+	    print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n";
+	  }
+	}
+}
+}
+@meth_count = ();
+@unmeth_count = ();
+}
+### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint
+else{
+my $sort_dir = './'; # there has been a cd into the output_directory already
+# my $sort_dir = $output_dir;
+# if ($sort_dir eq ''){
+#   $sort_dir = './';
+# }
+if ($gazillion){
+	if ($in =~ /gz$/){
+	    open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n";
+	}
+	else{
+	    open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n";
+	}
+	### Comment by Volker Brendel, Indiana University
+	### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros).  Omit the V, and things go very wrong in the tallying of reads."
+}
+else{
+	### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway,
+	### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase
+	if ($in =~ /gz$/){
+	    open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n";
+	}
+	else{
+	    open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n";
+	}
+}
+while (my $line = <$ifh>) {
+	next if ($line =~ /^Bismark/);
+	chomp $line;
+	#warn "full line:\n$line\n";
+	$last_chr = $chr;
+	$last_pos = $pos;
+	($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line;
+	#warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1);
+	if (($last_pos ne $pos) || ($last_chr ne $chr)) {
+	    generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
+	    @methylcalls = qw (0 0 0);
+	}
+my $validated = validate_methylation_call($meth_state, $meth_state2);
+unless($validated){
+	warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n";
+	next;
+}
+if ($meth_state eq "+") {
+	$methylcalls[0]++;
+	$methylcalls[2]++;
+} else {
+	  $methylcalls[1]++;
+	$methylcalls[2]++;
+}
+}
+$last_chr = $chr;
+$last_pos = $pos;
+if ($methylcalls[2] > 0) {
+generate_output($last_chr,$last_pos) if $methylcalls[2] > 0;
+}
+close $ifh or die $!;
+@methylcalls = qw (0 0 0); # resetting @methylcalls
+}
+### deleting temporary files (only needed if --gazillion hasn't been specified
+if ($gazillion and scalar @bedfiles == 1){
+# if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed
+}
+else{
+my $delete = unlink $in;
+if ($delete) {
+warn "Successfully deleted the temporary input file $in\n\n";
+}
+else {
+warn "The temporary inputfile $in could not be deleted $!\n\n";
+}
+}
+}
+close OUT or die $!;
+close COVERAGE or die $!;
+if ($zero){
+close ZEROCOVERAGE or die $!;
+}
+exit 0;
+sub validate_methylation_call{
+my $meth_state = shift;
+croak "Missing (+/-) methylation call" unless defined $meth_state;
+my $meth_state2 = shift;
+croak "Missing alphabetical methylation call" unless defined $meth_state2;
+my $is_consistent;
+($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2))
+: ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2));
+return 1 if $is_consistent;
+return 0;
+}
+sub check_CpG_methylation_call{
+my $meth1 = shift;
+my $meth2 = shift;
+return 1 if($meth1 eq "+" && $meth2 eq "Z");
+return 1 if($meth1 eq "-" && $meth2 eq "z");
+return 0;
+}
+sub check_nonCpG_methylation_call{
+my $meth1 = shift;
+my $meth2 = shift;
+return 1 if($meth1 eq "+" && $meth2 eq "C");
+return 1 if($meth1 eq "+" && $meth2 eq "X");
+return 1 if($meth1 eq "+" && $meth2 eq "H");
+return 1 if($meth1 eq "-" && $meth2 eq "c");
+return 1 if($meth1 eq "-" && $meth2 eq "x");
+return 1 if($meth1 eq "-" && $meth2 eq "h");
+return 0;
+}
+sub generate_output{
+my $methcount = $methylcalls[0];
+my $nonmethcount = $methylcalls[1];
+my $totalcount = $methylcalls[2];
+my $last_chr = shift;
+my $last_pos = shift;
+croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0);
+croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) );
+my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based.
+my $meth_percentage;
+($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef);
+# $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/;
+if (defined $meth_percentage){
+# this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?)
+my $one_based_pos = $bed_pos + 1;
+print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n";
+# this is the coverage file. Coordinates are 1-based
+print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
+# this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end
+if ($zero){
+print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n";
+}
+}
+}
+sub process_commandline{
+my $help;
+my $output_dir;
+my $bedGraph_output;
+my $no_header;
+my $coverage_threshold; # Minimum number of reads covering before calling methylation status
+my $remove;
+my $counts;
+my $CX_context;
+my $sort_size;
+my $version;
+my $gazillion;
+my $ample_mem;
+my $zero;
+my $input_dir;
+my $command_line = GetOptions ('help|man'            => \$help,
+				 'dir=s'               => \$output_dir,
+				 'o|output=s'          => \$bedGraph_output,
+				 'no_header'           => \$no_header,
+				 "cutoff=i"            => \$coverage_threshold,
+				 "remove_spaces"       => \$remove,
+				 "counts"              => \$counts,
+				 "CX|CX_context"       => \$CX_context,
+				 "buffer_size=s"       => \$sort_size,
+				 'version'             => \$version,
+				 'gazillion|scaffolds' => \$gazillion,
+				 'ample_memory'        => \$ample_mem,
+				 "zero_based"          => \$zero,
+			);
+### EXIT ON ERROR if there were errors with any of the supplied options
+unless ($command_line){
+die "Please respecify command line options\n";
+}
+### HELPFILE
+if ($help){
+print_helpfile();
+exit;
+}
+if ($version){
+print << "VERSION";
+Bismark Methylation Extractor Module -
+bismark2bedGraph
+Bismark Extractor Version: $bismark2bedGraph_version
+Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
+www.bioinformatics.babraham.ac.uk/projects/bismark/
+VERSION
+exit;
+}
+@sorting_files = @ARGV;
+### no files provided
+unless (@sorting_files){
+warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n";
+sleep(2);
+print_helpfile();
+exit;
+}
+### PARENT DIRECTORY
+my $parent_dir = getcwd();
+# warn "parent directory is: $parent_dir\n";
+### OUTPUT DIR PATH
+if (defined $output_dir){
+unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it
+	unless ($output_dir =~ /\/$/){
+	    $output_dir =~ s/$/\//;
+	}
+	unless (-d $output_dir){
+	    mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n";
+	    warn "Created output directory $output_dir\n";
+	}
+### want to get an absolute path for the output directory instead of a relative one
+chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n";
+$output_dir = getcwd();
+unless ($output_dir =~ /\/$/){
+	$output_dir =~ s/$/\//;
+}
+# warn "output directory is: $output_dir\n";
+# changing back to the parent directory
+chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n";
+}
+}
+else{
+$output_dir = '';
+}
+unless (defined $bedGraph_output){
+die "Please provide the name of the output file using the option -o/--output filename\n";
+}
+if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name
+die "Please specify a file name without any path information (or use --dir if necessary)\n\n";
+}
+unless ($bedGraph_output =~ /\.gz$/){
+$bedGraph_output = "${bedGraph_output}.gz";  ### 22 07 2015: Output will be gzip compressed
+}
+### NO HEADER
+unless ($no_header){
+$no_header = 0;
+}
+### remove white spaces in read ID (needed for sorting using the sort command
+unless ($remove){
+$remove = 0;
+}
+### COVERAGE THRESHOLD FOR bedGraph OUTPUT
+if (defined $coverage_threshold){
+unless ($coverage_threshold > 0){
+die "Please select a coverage greater than 0 (positive integers only)\n";
+}
+}
+else{
+$coverage_threshold = 1;
+}
+### SORT buffer size
+if (defined $sort_size){
+unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){
+die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n";
+}
+}
+else{
+$sort_size = '2G';
+}
+unless ($CX_context){
+$CX_context = 0;
+}
+unless ($counts){
+$counts = 1;
+}
+if ($gazillion){
+if ($ample_mem){
+die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n";
+}
+}
+return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir);
+}
+sub print_helpfile{
+print <<EOF
+SYNOPSIS:
+This script uses positional methylation data generated by the Bismark methylation extractor to generate
+a bedGraph file as well as a coverage file which are both sorted by chromosomal position. The bedGraph
+file uses 0-based genomic start and 1-based genomic end coordinates and should be UCSC compatible (if
+UCSC genomes were used for the alignment step). In addition this module will write out a coverage file
+which is similar to the bedGraph file, but uses 1-based genomic coordinates and also reports the count
+of methylated and unmethylated cytosines for any covered position; this coverage file is required if you
+wish to generate a genome-wide cytosine report with the module coverage2cytosine.
+USAGE: bismark2bedGraph [options] -o <output> [methylation extractor input files]
+Methylation extractor input files: These files are required to start with CpG... in order for the
+script to correctly work out the sequence context when using CpG context only (default). If all cytosine
+contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s).
+-o/--output <filename>     Name of the output file, mandatory.
+--dir                      Output directory. Output is written to the current directory if not specified explicitly.
+--cutoff [threshold]       The minimum number of times a methylation state has to be seen for that nucleotide
+before its methylation percentage is reported. Default: 1.
+--remove_spaces            Replaces whitespaces in the sequence ID field with underscores to allow sorting.
+--CX/--CX_context          The sorted bedGraph output file contains information on every single cytosine that was covered
+in the experiment irrespective of its sequence context. This applies to both forward and
+reverse strands. Please be aware that this option may generate large temporary and output files
+and may take a long time to sort (up to many hours). Default: OFF.
+(i.e. Default = CpG context only).
+--buffer_size <string>     This allows you to specify the main memory sort buffer when sorting the methylation information.
+Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or
+			   a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc.
+(e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line.
+Defaults to 2G.
+--scaffolds/--gazillion    Users working with unfinished genomes sporting tens or even hundreds of thousands of
+scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to
+individual chromosome files. These errors were caused by the operating system's limit
+of the number of filehandle that can be written to at any one time (typically 1024; to
+find out this limit on Linux, type: ulimit -a).
+To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort
+methylation calls into individual chromosome files. Instead, all input files are
+temporarily merged into a single file (unless there is only a single file), and this
+file will then be sorted by both chromosome AND position using the Unix sort command.
+Please be aware that this option might take a looooong time to complete, depending on
+the size of the input files, and the memory you allocate to this process (see --buffer_size).
+Nevertheless, it seems to be working.
+--ample_memory             Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will
+instead use two arrays to sort methylated and unmethylated calls, respectively. This may result
+in a faster sorting process for very large files, but this comes at the cost of a larger memory
+footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly
+250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads
+of creating and looping through arrays this option might in fact be *slower* for small-ish
+files (up to a few million alignments). Note also that this option is not currently compatible
+with options '--scaffolds/--gazillion'.
+--zero_based               Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start
+and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file,
+instead of using 1-based coordinates throughout. Default: OFF.
+The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords):
+==============================================================================================
+track type=bedGraph (header line)
+<chromosome>  <start position>  <end position>  <methylation percentage>
+The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'):
+=======================================================================================================================================
+<chromosome>  <start position>  <end position>  <methylation percentage>  <count methylated>  <count non-methylated>
+Script last modified: 09 December 2015
+EOF
+;
+exit 1;
+}

Mercurial > repos > bgruening > bismark

comparison bismark_methyl_extractor/bismark2bedGraph @ 7:fcadce4d9a06 draft