diff bismark_deduplicate/deduplicate_bismark @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bismark_deduplicate/deduplicate_bismark	Sat May 06 13:18:09 2017 -0400
@@ -0,0 +1,1272 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Getopt::Long;
+
+
+### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
+### Paired-end alignments are considered a duplicate if both partner reasd start and end at the exact same position
+
+### May 13, 2013
+### Changed the single-end trimming behavior so that only the start coordinate will be used. This avoids duplicate reads that have been trimmed to a varying extent
+### Changed the way of determining the end of reads in SAM format to using the CIGAR string if the read contains InDels
+
+### 16 July 2013
+### Adding a new deduplication mode for barcoded RRBS-Seq
+
+### 27 Sept 2013
+### Added close statement for all output filehandles (which should probably have been there from the start...)
+
+### 8 Jan 2015
+### to detect paired-end command from the @PG line we are no requiring spaces before and after the -1 or -2
+
+### 09 Mar 2015
+### Removing newline characters also from the read conversion flag in case the tags had been reordered and are now present in the very last column
+
+### 19 08 2015
+### Hiding the option --representative from view to discourage people from using it (it was nearly always not what they wanted to do anyway). It should still work 
+### for alignments that do not contain any indels
+### Just for reference, here is the the text:
+### print "--representative\twill browse through all sequences and print out the sequence with the most representative (as in most frequent) methylation call for any given position. Note that this is very likely the most highly amplified PCR product for a given sequence\n\n";
+
+
+my $dedup_version = 'v0.16.3';
+
+my $help;
+my $representative;
+my $single;
+my $paired;
+my $global_single;
+my $global_paired;
+my $vanilla;
+my $samtools_path;
+my $bam;
+my $rrbs;
+my $version;
+
+my $command_line = GetOptions ('help' => \$help,
+			       'representative' => \$representative,
+			       's|single' => \$global_single,
+			       'p|paired' => \$global_paired,
+			       'vanilla' => \$vanilla,
+			       'samtools_path=s' => \$samtools_path,
+			       'bam' => \$bam,
+			       'barcode' => \$rrbs,
+			       'version' => \$version,
+			      );
+
+die "Please respecify command line options\n\n" unless ($command_line);
+
+if ($help){
+  print_helpfile();
+  exit;
+}
+
+if ($version){
+  print << "VERSION";
+
+                           Bismark Deduplication Module
+
+                          Deduplicator Version: $dedup_version
+              Copyright 2010-16 Felix Krueger, Babraham Bioinformatics
+                www.bioinformatics.babraham.ac.uk/projects/bismark/
+
+
+VERSION
+    exit;
+  }
+
+
+
+my @filenames = @ARGV;
+
+unless (@filenames){
+  print "Please provide one or more Bismark output files for deduplication\n\n";
+  sleep (2);
+  print_helpfile();
+  exit;
+}
+
+
+### OPTIONS
+unless ($global_single or $global_paired){
+  if ($vanilla){
+    die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n";
+  }
+  warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n";
+}
+
+if ($global_paired){
+  if ($global_single){
+    die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n";
+  }
+  if ($vanilla){
+
+    if ($rrbs){
+      die "Barcode deduplication only works with Bismark SAM (or BAM) output (in attempt to phase out the vanilla format)\n";
+    }
+
+    warn "Processing paired-end custom Bismark output file(s):\n";
+    warn join ("\t",@filenames),"\n\n";
+  }
+  else{
+    warn "Processing paired-end Bismark output file(s) (SAM format):\n";
+    warn join ("\t",@filenames),"\n\n";
+  }
+}
+else{
+  if ($vanilla){
+    warn "Processing single-end custom Bismark output file(s):\n";
+    warn join ("\t",@filenames),"\n\n";
+  }
+  else{
+    warn "Processing single-end Bismark output file(s) (SAM format):\n";
+    warn join ("\t",@filenames),"\n\n";
+  }
+}
+
+### PATH TO SAMTOOLS
+if (defined $samtools_path){
+  # if Samtools was specified as full command
+  if ($samtools_path =~ /samtools$/){
+    if (-e $samtools_path){
+      # Samtools executable found
+    }
+    else{
+      die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+    }
+  }
+  else{
+    unless ($samtools_path =~ /\/$/){
+      $samtools_path =~ s/$/\//;
+    }
+    $samtools_path .= 'samtools';
+    if (-e $samtools_path){
+      # Samtools executable found
+    }
+    else{
+      die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
+    }
+  }
+}
+# Check whether Samtools is in the PATH if no path was supplied by the user
+else{
+  if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
+    $samtools_path = `which samtools`;
+    chomp $samtools_path;
+  }
+}
+
+if ($bam){
+  if (defined $samtools_path){
+    $bam = 1;
+  }
+  else{
+    warn "No Samtools found on your system, writing out a gzipped SAM file instead\n";
+    $bam = 2;
+  }
+}
+else{
+  $bam = 0;
+}
+
+
+if ($representative){
+  warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n";
+  sleep (1);
+}
+elsif($rrbs){
+  warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n";
+  sleep (1);
+}
+else{ # default; random (=first) alignment
+  warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n";
+  sleep (1);
+}
+
+foreach my $file (@filenames){
+
+  if ($global_single){
+    $paired = 0;
+    $single = 1;
+  }
+  elsif($global_paired){
+    $paired = 1;
+    $single = 0;
+  }
+
+  unless($global_single or $global_paired){
+
+    warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1);
+
+    ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
+    if ($file =~ /\.gz$/){
+      open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
+    }
+    elsif ($file =~ /\.bam$/){
+      open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+    }
+    else{
+      open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
+    }
+    while (<DETERMINE>){
+      last unless (/^\@/);
+      if ($_ =~ /^\@PG/){
+	# warn "found the \@PG line:\n";
+	# warn "$_";
+	
+	if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){
+	  warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1);
+	  $paired = 1;
+	  $single = 0;
+	}
+	else{
+	  warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1);
+	  $paired = 0;
+	  $single = 1;
+	}
+      }
+    }
+    close DETERMINE or warn "$!\n";
+  }
+
+  if ($file =~ /(\.bam$|\.sam$)/){
+      bam_isEmpty($file);
+  }
+  
+
+  ### OPTIONS
+  unless ($single or $paired){
+    die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n";
+  }
+
+  ### 
+  unless ($vanilla){
+    if ($paired){
+      test_positional_sorting($file);
+    }
+  }
+
+
+  ### writing to a report file
+  my $report = $file;
+
+  $report =~ s/\.gz$//;
+  $report =~ s/\.sam$//;
+  $report =~ s/\.bam$//;
+  $report =~ s/\.txt$//;
+  $report =~ s/$/.deduplication_report.txt/;
+
+  open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n";
+
+
+  ### for representative methylation calls we need to discriminate between single-end and paired-end files as the latter have 2 methylation call strings
+  if($representative){
+    deduplicate_representative($file);
+  }
+
+  elsif($rrbs){
+    deduplicate_barcoded_rrbs($file);
+  }
+
+  ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option
+  else{
+
+    my %unique_seqs;
+    my %positions;
+
+    if ($file =~ /\.gz$/){
+      open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
+    }
+    elsif ($file =~ /\.bam$/){
+      open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+    }
+    else{
+      open (IN,$file) or die "Unable to read from $file: $!\n";
+    }
+
+    my $outfile = $file;
+    $outfile =~ s/\.gz$//;
+    $outfile =~ s/\.sam$//;
+    $outfile =~ s/\.bam$//;
+    $outfile =~ s/\.txt$//;
+
+    if ($vanilla){
+      $outfile =~ s/$/_deduplicated.txt/;
+    }
+    else{
+      if ($bam == 1){
+	$outfile =~ s/$/.deduplicated.bam/;
+      }
+      elsif ($bam == 2){
+	$outfile =~ s/$/.deduplicated.sam.gz/;
+      }
+      else{
+	$outfile =~ s/$/.deduplicated.sam/;
+      }
+    }
+    if ($bam == 1){
+      open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
+    }
+    elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
+      open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
+    }
+
+    else{
+      open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
+    }
+
+    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
+    if ($vanilla){
+      $_ = <IN>; # Bismark version header
+      print OUT; # Printing the Bismark version to the de-duplicated file again
+    }
+    my $count = 0;
+    my $unique_seqs = 0;
+    my $removed = 0;
+
+    while (<IN>){
+
+      if ($count == 0){
+	if ($_ =~ /^Bismark version:/){
+	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
+	  sleep (2);
+	  print_helpfile();
+	  exit;
+	}
+      }
+
+      ### if this was a SAM file we ignore header lines
+      unless ($vanilla){
+	if (/^\@\w{2}\t/){
+	  print "skipping header line:\t$_";
+	  print OUT "$_"; # Printing the header lines again into the de-duplicated file
+	  next;
+	}
+      }
+
+      ++$count;
+      my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
+
+      my ($strand,$chr,$start,$end,$cigar);
+      my $line1;
+
+      if ($vanilla){
+	($strand,$chr,$start,$end) = (split (/\t/))[1,2,3,4];
+      }
+      else{ # SAM format
+	($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand
+
+	### SAM single-end
+	if ($single){
+
+	  if ($strand == 0 ){
+	    ### read aligned to the forward strand. No action needed
+	  }
+	  elsif ($strand == 16){
+	    ### read is on reverse strand
+	
+	    $start -= 1; # only need to adjust this once
+	
+	    # for InDel free matches we can simply use the M number in the CIGAR string
+	    if ($cigar =~ /^(\d+)M$/){ # linear match
+	      $start += $1;
+	    }
+
+	    else{
+	      # parsing CIGAR string
+	      my @len = split (/\D+/,$cigar); # storing the length per operation
+	      my @ops = split (/\d+/,$cigar); # storing the operation
+	      shift @ops; # remove the empty first element
+	      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+	      # warn "CIGAR string; $cigar\n";
+	      ### determining end position of a read
+	      foreach my $index(0..$#len){
+		if ($ops[$index] eq 'M'){  # standard matching bases
+		  $start += $len[$index];
+		  # warn "Operation is 'M', adding $len[$index] bp\n";
+		}
+		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+		  # warn "Operation is 'I', next\n";
+		}
+		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+		  #  warn "Operation is 'D',adding $len[$index] bp\n";
+		  $start += $len[$index];
+		}
+		else{
+		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+		}
+	      }
+	    }
+	  }
+	  $composite = join (":",$strand,$chr,$start);
+	}
+	elsif($paired){
+
+	  ### storing the current line
+	  $line1 = $_;
+
+	  my $read_conversion;
+	  my $genome_conversion;
+
+	  while ( /(XR|XG):Z:([^\t]+)/g ) {
+	    my $tag = $1;
+	    my $value = $2;
+
+	    if ($tag eq "XR") {
+	      $read_conversion = $value;
+	      $read_conversion =~ s/\r//;
+	      chomp $read_conversion;
+	    } elsif ($tag eq "XG") {
+	      $genome_conversion = $value;
+	      $genome_conversion =~ s/\r//;
+	      chomp $genome_conversion;
+	    }
+	  }
+	  die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);
+
+	
+	  my $index;
+	  if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+	    $index = 0;
+	    $strand = '+';
+	  } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+	    $index = 1;
+	    $strand = '-';
+	  } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+	    $index = 2;
+	    $strand = '+';
+	  } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+	    $index = 3;
+	    $strand = '-';
+	  } else {
+	    die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+	  }
+	
+	  # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2	
+	  if ($index == 0 or $index == 2){
+	
+	    ### reading in the next line
+	    $_ = <IN>;
+	    # the only thing we need is the end position
+	    ($end,my $cigar_2) = (split (/\t/))[3,5];
+
+	    $end -= 1; # only need to adjust this once
+	
+	    # for InDel free matches we can simply use the M number in the CIGAR string
+	    if ($cigar_2 =~ /^(\d+)M$/){ # linear match
+	      $end += $1;
+	    }
+	    else{
+	      # parsing CIGAR string
+	      my @len = split (/\D+/,$cigar_2); # storing the length per operation
+	      my @ops = split (/\d+/,$cigar_2); # storing the operation
+	      shift @ops; # remove the empty first element
+	      die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
+	
+	      # warn "CIGAR string; $cigar_2\n";
+	      ### determining end position of the read
+	      foreach my $index(0..$#len){
+		if ($ops[$index] eq 'M'){  # standard matching bases
+		  $end += $len[$index];
+		  # warn "Operation is 'M', adding $len[$index] bp\n";
+		}
+		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+		  # warn "Operation is 'I', next\n";
+		}
+		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+		  #  warn "Operation is 'D',adding $len[$index] bp\n";
+		  $end += $len[$index];
+		}
+		else{
+		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+		}
+	      }
+	    }
+	  }
+	  else{
+	    # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line
+
+	    $end = $start - 1; # need to adjust this only once
+	
+	    # for InDel free matches we can simply use the M number in the CIGAR string
+	    if ($cigar =~ /^(\d+)M$/){ # linear match
+	      $end += $1;
+	    }
+	    else{
+	      # parsing CIGAR string
+	      my @len = split (/\D+/,$cigar); # storing the length per operation
+	      my @ops = split (/\d+/,$cigar); # storing the operation
+	      shift @ops; # remove the empty first element
+	      die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
+	
+	      # warn "CIGAR string; $cigar\n";
+	      ### determining end position of the read
+	      foreach my $index(0..$#len){
+		if ($ops[$index] eq 'M'){  # standard matching bases
+		  $end += $len[$index];
+		  # warn "Operation is 'M', adding $len[$index] bp\n";
+		}
+		elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+		  # warn "Operation is 'I', next\n";
+		}
+		elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+		  # warn "Operation is 'D',adding $len[$index] bp\n";
+		  $end += $len[$index];
+		}
+		else{
+		  die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+		}
+	      }
+	    }
+	
+	    ### reading in the next line
+	    $_ = <IN>;
+	    # the only thing we need is the start position
+	    ($start) = (split (/\t/))[3];
+	  }
+	  $composite = join (":",$strand,$chr,$start,$end);
+	}
+	
+	else{
+	  die "Input must be single or paired-end\n";
+	}
+      }
+
+      if (exists $unique_seqs{$composite}){
+	++$removed;
+	unless (exists $positions{$composite}){
+	  $positions{$composite}++;
+	}
+      }
+      else{
+	if ($paired){
+	  unless ($vanilla){
+	    print OUT "$line1"; # printing first paired-end line for SAM output
+	  }
+	}
+	print OUT "$_"; # printing single-end SAM alignment or second paired-end line
+	$unique_seqs{$composite}++;
+      }
+    }
+
+    my $percentage;
+    my $percentage_leftover;
+    my $leftover = $count - $removed;
+
+    unless ($count == 0){
+      $percentage = sprintf("%.2f",$removed/$count*100);
+      $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
+    }
+    else{
+      $percentage = 'N/A';
+      $percentage_leftover = 'N/A';
+    }
+
+    warn "\nTotal number of alignments analysed in $file:\t$count\n";
+    warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
+    warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
+    warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
+
+    print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
+    print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
+    print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
+    print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
+  }
+
+  close OUT or warn "Failed to close output filehandle: $!\n";
+  close REPORT or warn "Failed to close report filehandle: $!\n";
+
+}
+
+
+sub deduplicate_barcoded_rrbs{
+
+  my $file = shift;
+
+  my %unique_seqs;
+  my %positions;
+
+  if ($file =~ /\.gz$/){
+    open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
+  }
+  elsif ($file =~ /\.bam$/){
+    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+  }
+  else{
+    open (IN,$file) or die "Unable to read from $file: $!\n";
+  }
+
+  my $outfile = $file;
+  $outfile =~ s/\.gz$//;
+  $outfile =~ s/\.sam$//;
+  $outfile =~ s/\.bam$//;
+  $outfile =~ s/\.txt$//;
+
+  if ($vanilla){
+    $outfile =~ s/$/_dedup_RRBS.txt/;
+  }
+  else{
+    if ($bam == 1){
+      $outfile =~ s/$/.dedup_RRBS.bam/;
+    }
+    elsif ($bam == 2){
+      $outfile =~ s/$/.dedupRRBS.sam.gz/;
+    }
+    else{
+      $outfile =~ s/$/.dedup_RRBS.sam/;
+    }
+  }
+  if ($bam == 1){
+    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
+  }
+  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
+    open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
+  }
+  else{
+    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
+  }
+
+  ### This mode only supports Bismark SAM output
+  my $count = 0;
+  my $unique_seqs = 0;
+  my $removed = 0;
+
+  while (<IN>){
+
+    if ($count == 0){
+      if ($_ =~ /^Bismark version:/){
+	warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
+	sleep (2);
+	print_helpfile();
+	exit;
+      }
+    }
+
+    ### if this was a SAM file we ignore header lines
+    if (/^\@\w{2}\t/){
+      warn "skipping SAM header line:\t$_";
+      print OUT; # Printing the header lines again into the de-duplicated file
+      next;
+    }
+
+    ++$count;
+    my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
+    ### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication
+    ### in effect the $composite string looks like this (separated by ':'):
+
+    ### FLAG:chromosome:start:barcode
+
+    my $end;
+    my $line1;
+
+    # SAM format
+    my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand
+
+    $id =~ /:(\w+)$/;
+    my $barcode = $1;
+    unless ($barcode){
+      die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n";
+    }
+
+    ### SAM single-end
+    if ($single){
+
+      if ($strand == 0 ){
+	### read aligned to the forward strand. No action needed
+      }
+      elsif ($strand == 16){
+	### read is on reverse strand
+	
+	$start -= 1; # only need to adjust this once
+	
+	# for InDel free matches we can simply use the M number in the CIGAR string
+	if ($cigar =~ /^(\d+)M$/){ # linear match
+	  $start += $1;
+	}
+	else{
+	  # parsing CIGAR string
+	  my @len = split (/\D+/,$cigar); # storing the length per operation
+	  my @ops = split (/\d+/,$cigar); # storing the operation
+	  shift @ops; # remove the empty first element
+	  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
+
+	  # warn "CIGAR string; $cigar\n";
+	  ### determining end position of a read
+	  foreach my $index(0..$#len){
+	    if ($ops[$index] eq 'M'){  # standard matching bases
+	      $start += $len[$index];
+	      # warn "Operation is 'M', adding $len[$index] bp\n";
+	    }
+	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+	      # warn "Operation is 'I', next\n";
+	    }
+	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+	      #  warn "Operation is 'D',adding $len[$index] bp\n";
+	      $start += $len[$index];
+	    }
+	    else{
+	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+	    }
+	  }
+	}
+      }
+
+      ### Here we take the barcode sequence into consideration
+      $composite = join (":",$strand,$chr,$start,$barcode);
+      # warn "$composite\n\n";
+      # sleep(1);
+    }
+    elsif($paired){
+
+      ### storing the current line
+      $line1 = $_;
+
+      my $read_conversion;
+      my $genome_conversion;
+
+      while ( /(XR|XG):Z:([^\t]+)/g ) {
+	my $tag = $1;
+	my $value = $2;
+	
+	if ($tag eq "XR") {
+	  $read_conversion = $value;
+	  $read_conversion =~ s/\r//;
+	  chomp $read_conversion;
+	}
+	elsif ($tag eq "XG") {
+	  $genome_conversion = $value;
+	  $genome_conversion =~ s/\r//;
+	  chomp $genome_conversion;
+	}
+      }
+      die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion);
+
+	
+      my $index;
+      if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand
+	$index = 0;
+	$strand = '+';
+      } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand
+	$index = 1;
+	$strand = '-';
+      } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand
+	$index = 2;
+	$strand = '+';
+      } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand
+	$index = 3;
+	$strand = '-';
+      } else {
+	die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n";
+      }
+	
+      # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2	
+      if ($index == 0 or $index == 2){
+	
+	### reading in the next line
+	$_ = <IN>;
+	# the only thing we need is the end position
+	($end,my $cigar_2) = (split (/\t/))[3,5];
+
+	$end -= 1; # only need to adjust this once
+	
+	# for InDel free matches we can simply use the M number in the CIGAR string
+	if ($cigar_2 =~ /^(\d+)M$/){ # linear match
+	  $end += $1;
+	}
+	else{
+	  # parsing CIGAR string
+	  my @len = split (/\D+/,$cigar_2); # storing the length per operation
+	  my @ops = split (/\d+/,$cigar_2); # storing the operation
+	  shift @ops; # remove the empty first element
+	  die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);
+	
+	  # warn "CIGAR string; $cigar_2\n";
+	  ### determining end position of the read
+	  foreach my $index(0..$#len){
+	    if ($ops[$index] eq 'M'){  # standard matching bases
+	      $end += $len[$index];
+	      # warn "Operation is 'M', adding $len[$index] bp\n";
+	    }
+	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+	      # warn "Operation is 'I', next\n";
+	    }
+	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+	      #  warn "Operation is 'D',adding $len[$index] bp\n";
+	      $end += $len[$index];
+	    }
+	    else{
+	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+	    }
+	  }
+	}
+      }
+      else{
+	# else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line
+	
+	$end = $start - 1; # need to adjust this only once
+	
+	# for InDel free matches we can simply use the M number in the CIGAR string
+	if ($cigar =~ /^(\d+)M$/){ # linear match
+	  $end += $1;
+	}
+	else{
+	  # parsing CIGAR string
+	  my @len = split (/\D+/,$cigar); # storing the length per operation
+	  my @ops = split (/\d+/,$cigar); # storing the operation
+	  shift @ops; # remove the empty first element
+	  die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);
+	
+	  # warn "CIGAR string; $cigar\n";
+	  ### determining end position of the read
+	  foreach my $index(0..$#len){
+	    if ($ops[$index] eq 'M'){  # standard matching bases
+	      $end += $len[$index];
+	      # warn "Operation is 'M', adding $len[$index] bp\n";
+	    }
+	    elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
+	      # warn "Operation is 'I', next\n";
+	    }
+	    elsif($ops[$index] eq 'D'){ # deletions do affect the end position
+	      # warn "Operation is 'D',adding $len[$index] bp\n";
+	      $end += $len[$index];
+	    }
+	    else{
+	      die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
+	    }
+	  }
+	}
+	
+	### reading in the next line
+	$_ = <IN>;
+	# the only thing we need is the start position
+	($start) = (split (/\t/))[3];
+      }
+
+      ### Here we take the barcode sequence into consideration
+      $composite = join (":",$strand,$chr,$start,$end,$barcode);
+    }
+    else{
+      die "Input must be single or paired-end\n";
+    }
+
+    if (exists $unique_seqs{$composite}){
+      ++$removed;
+      unless (exists $positions{$composite}){
+	$positions{$composite}++;
+      }
+    }
+    else{
+      if ($paired){
+	print OUT $line1; # printing first paired-end line for SAM output
+      }
+      print OUT; # printing single-end SAM alignment or second paired-end line
+      $unique_seqs{$composite}++;
+    }
+  }
+
+  my $percentage;
+  my $percentage_leftover;
+  my $leftover = $count - $removed;
+
+  unless ($count == 0){
+    $percentage = sprintf("%.2f",$removed/$count*100);
+    $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
+  }
+  else{
+    $percentage = 'N/A';
+    $percentage_leftover = 'N/A';
+  }
+
+
+  warn "\nTotal number of alignments analysed in $file:\t$count\n";
+  warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
+  warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
+  warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
+
+  print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
+  print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
+  print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
+  print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
+
+}
+
+sub bam_isEmpty{
+      
+    my $file = shift;
+
+    if ($file =~ /\.bam$/){
+	open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n";
+    }
+    else{
+	open (EMPTY,$file) or die "Unable to read from $file: $!\n";
+    }
+    my $count = 0;
+    while (<EMPTY>){
+	if ($_){
+	    $count++;  # file contains data, fine.
+	}
+	last; # one line is enough
+    }
+
+    if ($count == 0){
+	die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n";
+    }
+    close EMPTY or warn "$!\n";
+}
+
+
+sub print_helpfile{
+  print "\n",'='x111,"\n";
+  print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(this is the fastest option, and as the alignments are not ordered in any way this is also near enough random).\n\n";
+  print "For single-end alignments only use the start coordinate of a read will be used for deduplication.\n\n";
+  print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication. ";
+  print "This script expects the Bismark output to be in SAM format\n(Bismark v0.6.x or higher). To deduplicate the old custom Bismark output please specify '--vanilla'.\n\n";
+  print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n)  ***\n\n";
+
+  print '='x111,"\n\n";
+  print ">>> USAGE: ./deduplicate_bismark_alignment_output.pl [options] filename(s) <<<\n\n";
+
+  print "-s/--single\t\tdeduplicate single-end Bismark files (default format: SAM)\n";
+  print "-p/--paired\t\tdeduplicate paired-end Bismark files (default format: SAM)\n\n";
+  print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n";
+  print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n                        consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n                        by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n";
+  print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n";
+  print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n";
+  print "--version\t\tPrint version information and exit\n";
+
+  print '='x111,"\n\n";
+
+  print "This script was last modified on November 04, 2015\n\n";
+}
+
+
+
+
+sub test_positional_sorting{
+
+  my $filename = shift;
+
+  print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
+  sleep(1);
+
+  if ($filename =~ /\.gz$/) {
+    open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
+  }
+  elsif ($filename =~ /bam$/ ||  isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
+    if ($samtools_path){
+      open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
+    }
+    else{
+      die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
+    }
+  }
+  else {
+    open (TEST,$filename) or die "Can't open file $filename: $!\n";
+  }
+
+  my $count = 0;
+
+  while (<TEST>) {
+    if (/^\@/) {	     # testing header lines if they contain the @SO flag (for being sorted)
+      if (/^\@SO/) {
+	die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
+      }
+      next;
+    }
+    $count++;
+
+    last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID
+
+    my ($id_1) = (split (/\t/));
+
+    ### reading the next line which should be read 2
+    $_ = <TEST>;
+    my ($id_2) = (split (/\t/));
+    last unless ($id_2);
+    ++$count;
+
+    if ($id_1 eq $id_2){
+      ### ids are the same
+      next;
+    }
+    else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
+      my $id_1_trunc = $id_1;
+      $id_1_trunc =~ s/\/1$//;
+      my $id_2_trunc = $id_2;
+      $id_2_trunc =~ s/\/2$//;
+
+      unless ($id_1_trunc eq $id_2_trunc){
+	die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
+      }
+    }
+  }
+  #  close TEST or die $!; somehow fails on our cluster...
+  ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
+  warn "...passed!\n";
+  sleep(1);
+
+}
+
+sub isBam{
+
+  my $filename = shift;
+
+  # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy)
+  open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";
+
+  ### when BAM files read through a gunzip -c stream they start with BAM...
+  my $bam_in_disguise = <DISGUISE>;
+  # warn "BAM in disguise: $bam_in_disguise\n\n";
+
+  if ($bam_in_disguise){
+    if ($bam_in_disguise =~ /^BAM/){
+      close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+      return 1;
+    }
+    else{
+      close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+     return 0;
+    }
+  }
+  else{
+    close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
+    return 0;
+  }
+}
+
+
+
+
+
+#####################################
+
+### The following subroutine "deduplicate_representative" only works correctly for reads that do not contain any indels (as in only Bowtie1-based alignments). 
+### Please refrain from using it unless you want to test something out.
+
+#####################################
+
+sub deduplicate_representative {
+  my $file = shift;
+
+  my %positions;
+  my %unique_seqs;
+
+  my $count = 0;
+  my $unique_seqs = 0;
+  my $removed = 0;
+
+  ### going through the file first and storing all positions as well as their methylation call strings in a hash
+  if ($file =~ /\.gz$/){
+    open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
+  }
+  elsif ($file =~ /\.bam$/){
+    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
+  }
+  else{
+    open (IN,$file) or die "Unable to read from $file: $!\n";
+  }
+
+  if ($single){
+
+    my $outfile = $file;
+    $outfile =~ s/\.gz$//;
+    $outfile =~ s/\.sam$//;
+    $outfile =~ s/\.bam$//;
+    $outfile =~ s/\.txt$//;
+
+    if ($vanilla){
+      $outfile =~ s/$/.deduplicated_to_representative_sequences.txt/;
+    }
+    else{
+      if ($bam == 1){
+	$outfile =~ s/$/.deduplicated_to_representative_sequences.bam/;
+      }
+      elsif ($bam == 2){
+	$outfile =~ s/$/.deduplicated_to_representative_sequences.sam.gz/;
+      }
+      else{
+	$outfile =~ s/$/.deduplicated_to_representative_sequences.sam/;
+      }
+    }
+
+    if ($bam == 1){
+      open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
+    }
+    elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
+      open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
+    }
+    else{
+      open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
+    }
+
+    warn "Reading and storing all alignment positions\n";
+
+    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
+    if ($vanilla){
+      $_ = <IN>; # Bismark version header
+      print OUT; # Printing the Bismark version to the de-duplicated file again
+    }
+
+    while (<IN>){
+
+      if ($count == 0){
+	if ($_ =~ /^Bismark version:/){
+	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
+	  sleep (2);
+	  print_helpfile();
+	  exit;
+	}
+      }
+
+      ### if this was a SAM file we ignore header lines
+      unless ($vanilla){
+	if (/^\@\w{2}\t/){
+	  warn "skipping SAM header line:\t$_";
+	  print OUT; # Printing the header lines again into the de-duplicated file
+	  next;
+	}
+      }
+
+      my ($strand,$chr,$start,$end,$meth_call);
+
+      if ($vanilla){
+	($strand,$chr,$start,$end,$meth_call) = (split (/\t/))[1,2,3,4,7];
+      }
+      else{ # SAM format
+
+	($strand,$chr,$start,my $seq,$meth_call) = (split (/\t/))[1,2,3,9,13]; # we are assigning the FLAG value to $strand
+	### SAM single-end
+	$end = $start + length($seq) - 1;
+      }
+
+      my $composite = join (":",$strand,$chr,$start,$end);
+
+      $count++;
+      $positions{$composite}->{$meth_call}->{count}++;
+      $positions{$composite}->{$meth_call}->{alignment} = $_;
+    }
+    warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
+    close IN or warn $!;
+  }
+
+  elsif ($paired){
+
+    ### we are going to concatenate both methylation call strings from the paired end file to form a joint methylation call string
+
+    my $outfile = $file;
+    if ($vanilla){
+      $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.txt/;
+    }
+    else{
+      $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.sam/;
+    }
+
+    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
+    warn "Reading and storing all alignment positions\n";
+
+    ### need to proceed slightly differently for the custom Bismark and Bismark SAM output
+    if ($vanilla){
+      $_ = <IN>; # Bismark version header
+      print OUT; # Printing the Bismark version to the de-duplicated file again
+    }
+
+    while (<IN>){
+
+      if ($count == 0){
+	if ($_ =~ /^Bismark version:/){
+	  warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
+	  sleep (2);
+	  print_helpfile();
+	  exit;
+	}
+      }
+
+      ### if this was a SAM file we ignore header lines
+      unless ($vanilla){
+	if (/^\@\w{2}\t/){
+	  warn "skipping SAM header line:\t$_";
+	  print OUT; # Printing the header lines again into the de-duplicated file
+	  next;
+	}
+      }
+
+      my ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2);
+      my $line1;
+
+      if ($vanilla){
+	($strand,$chr,$start,$end,$meth_call_1,$meth_call_2) = (split (/\t/))[1,2,3,4,7,10];
+      }
+      else{ # SAM paired-end format
+	
+	($strand,$chr,$start,$meth_call_1) = (split (/\t/))[1,2,3,13]; # we are assigning the FLAG value to $strand
+	
+	### storing the first line (= read 1 alignment)	
+	$line1 = $_;
+	
+	### reading in the next line
+	$_ = <IN>;
+	# we only need the end position and the methylation call
+	(my $pos,my $seq_2,$meth_call_2) = (split (/\t/))[3,9,13];
+	$end = $pos + length($seq_2) - 1;
+      }
+
+      my $composite = join (":",$strand,$chr,$start,$end);
+      $count++;
+      my $meth_call = $meth_call_1.$meth_call_2;
+
+      $positions{$composite}->{$meth_call}->{count}++;
+      if ($vanilla){
+	$positions{$composite}->{$meth_call}->{alignment} = $_;
+      }
+      else{ # SAM PAIRED-END
+	$positions{$composite}->{$meth_call}->{alignment_1} = $line1;
+	$positions{$composite}->{$meth_call}->{alignment_2} = $_;
+      }
+    }
+    warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n";
+    close IN or warn $!;
+  }
+
+  ### PRINTING RESULTS
+
+  ### Now going through all stored positions and printing out the methylation call which is most representative, i.e. the one which occurred most often
+  warn "Now printing out alignments with the most representative methylation call(s)\n";
+
+  foreach my $pos (keys %positions){
+    foreach my $meth_call (sort { $positions{$pos}->{$b}->{count} <=> $positions{$pos}->{$a}->{count} }keys %{$positions{$pos}}){
+      if ($paired){
+	if ($vanilla){
+	  print OUT $positions{$pos}->{$meth_call}->{alignment};
+	}
+	else{
+	  print OUT $positions{$pos}->{$meth_call}->{alignment_1}; # SAM read 1
+	  print OUT $positions{$pos}->{$meth_call}->{alignment_2}; # SAM read 2
+	}
+      }
+      else{ # single-end
+	print OUT $positions{$pos}->{$meth_call}->{alignment};
+      }
+      $unique_seqs++;
+      last; ### exiting once we printed a sequence with the most frequent methylation call for a position
+    }
+  }
+
+  my $percentage;
+  unless ($count == 0){
+    $percentage = sprintf ("%.2f",$unique_seqs*100/$count);
+  }
+  else{
+    $percentage = 'N/A';
+  }
+
+  warn "\nTotal number of alignments analysed in $file:\t$count\n";
+  warn "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";
+  print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
+  print REPORT "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n";
+
+}
+
+
+