Mercurial > repos > bgruening > bismark

diff bismark @ 3:91f07ff056ca draft
Uploaded
author: bgruening
date: Mon, 14 Apr 2014 16:43:14 -0400
parents: 62c6da72dd4a
--- a/bismark	Wed Aug 21 05:19:54 2013 -0400
+++ b/bismark	Mon Apr 14 16:43:14 2014 -0400
@@ -24,7 +24,7 @@
 
 
 my $parent_dir = getcwd;
-my $bismark_version = 'v0.7.12';
+my $bismark_version = 'v0.10.0';
 my $command_line = join (" ",@ARGV);
 
 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
@@ -35,7 +35,7 @@
 }
 my @filenames;   # will be populated by processing the command line
 
-my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat) = process_command_line();
+my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line();
 
 my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
 my %chromosomes; # stores the chromosome sequences of the mouse genome
@@ -293,9 +293,13 @@
 
   ### printing all alignments to a results file
   my $outfile = $filename;
+  if ($prefix){
+    $outfile = "$prefix.$outfile";
+  }
+
 
   if ($bowtie2){ # SAM format is the default for Bowtie 2
-    $outfile =~ s/$/_bt2_bismark.sam/;
+    $outfile =~ s/$/_bismark_bt2.sam/;
   }
   elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
     $outfile =~ s/$/_bismark.txt/;
@@ -327,8 +331,12 @@
 
   ### printing alignment and methylation call summary to a report file
   my $reportfile = $filename;
+  if ($prefix){
+    $reportfile = "$prefix.$reportfile";
+  }
+
   if ($bowtie2){
-    $reportfile =~ s/$/_bt2_bismark_SE_report.txt/;
+    $reportfile =~ s/$/_bismark_bt2_SE_report.txt/;
   }
   else{
     $reportfile =~ s/$/_bismark_SE_report.txt/;
@@ -339,12 +347,19 @@
 
   if ($unmapped){
     my $unmapped_file = $filename;
+    if ($prefix){
+      $unmapped_file = "$prefix.$unmapped_file";
+    }
+
     $unmapped_file =~ s/$/_unmapped_reads.txt/;
     open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
     print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
   }
   if ($ambiguous){
     my $ambiguous_file = $filename;
+    if ($prefix){
+      $ambiguous_file = "$prefix.$ambiguous_file";
+    }
     $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
     open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
     print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
@@ -399,7 +414,12 @@
   }
 
   ### printing all alignments to a results file
-  my $outfile = $filename_1;
+  my $outfile = $filename_1; 
+
+  if ($prefix){
+    $outfile = "$prefix.$outfile";
+  }
+
   if ($bowtie2){ # SAM format is the default Bowtie 2 output
     $outfile =~ s/$/_bismark_bt2_pe.sam/;
   }
@@ -433,6 +453,10 @@
 
   ### printing alignment and methylation call summary to a report file
   my $reportfile = $filename_1;
+  if ($prefix){
+    $reportfile = "$prefix.$reportfile";
+  }
+
   if ($bowtie2){
     $reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
   }
@@ -449,6 +473,10 @@
   if ($unmapped){
     my $unmapped_1 = $filename_1;
     my $unmapped_2 = $filename_2;
+    if ($prefix){
+      $unmapped_1 = "$prefix.$unmapped_1";
+      $unmapped_2 = "$prefix.$unmapped_2";
+    }
     $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
     $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
     open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
@@ -459,6 +487,11 @@
   if ($ambiguous){
     my $amb_1 = $filename_1;
     my $amb_2 = $filename_2;
+    if ($prefix){
+      $amb_1 = "$prefix.$amb_1";
+      $amb_2 = "$prefix.$amb_2";
+    }
+
     $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
     $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
     open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
@@ -578,19 +611,38 @@
   warn "Total number of C's analysed:\t$total_number_of_C\n\n";
   warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
   warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
-  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
-  warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
-  warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
-  warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+  if ($bowtie2){
+    warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+  }
+  warn "\n";
+
+  warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+  warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+  warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+  if ($bowtie2){
+    warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+  }
+  warn "\n";
 
   print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
   print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
-  print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";
+
+  print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
   print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
-  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
-  print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
-  print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
-  print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+  if ($bowtie2){
+    print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+  }
+  print REPORT "\n";
+
+  print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+  print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+  print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+  if ($bowtie2){
+    print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+  }
+  print REPORT "\n";
 
   my $percent_meCHG;
   if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
@@ -607,6 +659,12 @@
     $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
   }
 
+  my $percent_meC_unknown;
+  if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
+    $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
+  }
+
+
   ### printing methylated CpG percentage if applicable
   if ($percent_meCpG){
     warn "C methylated in CpG context:\t${percent_meCpG}%\n";
@@ -629,20 +687,117 @@
 
   ### printing methylated C percentage (CHH context) if applicable
   if ($percent_meCHH){
-    warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
-    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+    warn "C methylated in CHH context:\t${percent_meCHH}%\n";
+    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
   }
   else{
-    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
-    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
-  }
+    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+  }
+
+  ### printing methylated C percentage (Unknown C context) if applicable
+  if ($bowtie2){
+    if ($percent_meC_unknown){
+      warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+      print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+    }
+    else{
+      warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
+      print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
+    }
+  }
+  print REPORT "\n\n";
+  warn "\n\n";
 
   if ($seqID_contains_tabs){
     warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
     print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
   }
+
+
+  ###########################################################################################################################################
+  ### create pie-chart with mapping stats
+  ###########################################################################################################################################
+
+
+  my $filename;
+  if ($pbat){
+    $filename = $G_to_A_infile;
+  }
+  else{
+    $filename = $C_to_T_infile;
+  }
+
+  my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+  $pie_chart =~ s/gz$//;
+  $pie_chart =~ s/_C_to_T\.fastq$//;
+  $pie_chart =~ s/_G_to_A\.fastq$//;
+
+  #  if ($prefix){
+  #    $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
+  # }
+  $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
+
+
+  #Check whether the module GD::Graph is installed
+  my $gd_graph_installed = 0;
+  eval{
+    require GD::Graph::pie;
+    GD::Graph::pie->import();
+  };
+
+  unless($@) {
+    $gd_graph_installed = 1;
+  }
+  else{
+    warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
+    sleep(2);
+  }
+
+  if ($gd_graph_installed){
+    warn "Generating pie chart\n\n";
+    sleep(1);
+    my $graph = GD::Graph::pie->new(600,600);
+
+    my $percent_unaligned;
+    my $percent_multiple;
+    my $percent_unextractable;
+
+    if ($counting{sequences_count}){
+      $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
+      $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
+      $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
+    }
+    else{
+      $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
+    }
+
+    my @aln_stats = (
+		     ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
+		     [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
+		    );
+
+    $graph->set( 
+		start_angle => 180,
+		'3d' => 0,
+		label => 'Alignment stats (single-end)',
+		suppress_angle => 2,    # Only label slices of sufficient size
+		transparent => 0,
+		dclrs => [ qw(red lorange dgreen cyan) ],
+	       ) or die $graph->error;
+
+    my $gd = $graph->plot(\@aln_stats) or die $graph->error;
+
+    open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
+    binmode PIE;
+    print PIE $gd->png;
+  }
+
+  warn "====================\nBismark run complete\n====================\n\n";
+
 }
 
+
 sub print_final_analysis_report_paired_ends{
   my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
   ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
@@ -736,18 +891,36 @@
   warn "Total number of C's analysed:\t$total_number_of_C\n\n";
   warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
   warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
-  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
-  warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
-  warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
-  warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+  if ($bowtie2){
+    warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
+  }
+  warn "\n";
+
+  warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+  warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+  warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+  if ($bowtie2){
+    warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
+  }
+  warn "\n";
 
   print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
   print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
   print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
-  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
-  print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
-  print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
-  print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
+  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
+  if ($bowtie2){
+    print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
+  }
+  print REPORT "\n";
+
+  print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
+  print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
+  print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
+  if ($bowtie2){
+    print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
+  }
+  print REPORT "\n";
 
   my $percent_meCHG;
   if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
@@ -764,6 +937,12 @@
     $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
   }
 
+  my $percent_meC_unknown;
+  if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
+    $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
+  }
+
+
   ### printing methylated CpG percentage if applicable
   if ($percent_meCpG){
     warn "C methylated in CpG context:\t${percent_meCpG}%\n";
@@ -786,13 +965,112 @@
 
   ### printing methylated C percentage in CHH context if applicable
   if ($percent_meCHH){
-    warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
-    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
+    warn "C methylated in CHH context:\t${percent_meCHH}%\n";
+    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
+  }
+  else{
+    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
+  }
+
+  ### printing methylated C percentage (Unknown C context) if applicable
+  if ($bowtie2){
+    if ($percent_meC_unknown){
+      warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+      print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
+    }
+    else{
+      warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
+      print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
+    }
+  }
+  print REPORT "\n\n";
+  warn "\n\n";
+
+
+  ############################################################################################################################################
+  ### create pie-chart with mapping stats
+  ###########################################################################################################################################
+
+  my $filename;
+  if ($pbat){
+    $filename = $G_to_A_infile_1;
   }
   else{
-    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
-    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
-  }
+    $filename = $C_to_T_infile_1;
+  }
+
+  my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
+  $pie_chart =~ s/gz$//;
+  $pie_chart =~ s/_C_to_T.fastq$//;
+  $pie_chart =~ s/_G_to_A.fastq$//;
+  ### special format for gzipped PE Bowtie1 files
+  $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
+  $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;
+
+  if ($prefix){
+    # prefix is now being prepended to the temp files already
+    # $pie_chart = "$prefix.$pie_chart";
+  }
+  $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";
+
+  #Check whether the module GD::Graph is installed
+  my $gd_graph_installed = 0;
+  eval{
+    require GD::Graph::pie;
+    GD::Graph::pie->import();
+  };
+
+  unless($@) {
+    $gd_graph_installed = 1;
+  }
+  else{
+    warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
+    sleep(2);
+  }
+
+  if ($gd_graph_installed){
+    warn "Generating pie chart\n\n";
+    sleep(1);
+    my $graph = GD::Graph::pie->new(600,600);
+
+    my $percent_unaligned;
+    my $percent_multiple;
+    my $percent_unextractable;
+
+    if ($counting{sequences_count}){
+      $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
+      $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
+      $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
+    }
+    else{
+      $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
+    }
+
+    my @aln_stats = (
+		     ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
+		     [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
+		    );
+
+    # push @{$mbias_read1[0]},$pos;
+
+    $graph->set( 
+		start_angle => 180,
+		'3d' => 0,
+		label => 'Alignment stats (paired-end)',
+		suppress_angle => 2,    # Only label slices of sufficient size
+		transparent => 0,
+		dclrs => [ qw(red lorange dgreen cyan) ],
+	       ) or die $graph->error;
+
+    my $gd = $graph->plot(\@aln_stats) or die $graph->error;
+
+    open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
+    binmode PIE;
+    print PIE $gd->png;
+  }
+
+  warn "====================\nBismark run complete\n====================\n\n";
 
 }
 
@@ -831,7 +1109,7 @@
     }
 
     $counting{sequences_count}++;
-    if ($counting{sequences_count}%100000==0) {
+    if ($counting{sequences_count}%1000000==0) {
       warn "Processed $counting{sequences_count} sequences so far\n";
     }
     chomp $sequence;
@@ -993,8 +1271,8 @@
     }
 
     $counting{sequences_count}++;
-    if ($counting{sequences_count}%100000==0) {
-      warn "Processed $counting{sequences_count} sequences so far\n";
+    if ($counting{sequences_count}%1000000==0) {
+      warn "Processed $counting{sequences_count} sequence pairs so far\n";
     }
     my $orig_identifier_1 = $identifier_1;
     my $orig_identifier_2 = $identifier_2;
@@ -1090,8 +1368,8 @@
     }
 
     $counting{sequences_count}++;
-    if ($counting{sequences_count}%100000==0) {
-      warn "Processed $counting{sequences_count} sequences so far\n";
+    if ($counting{sequences_count}%1000000==0) {
+      warn "Processed $counting{sequences_count} sequence pairs so far\n";
     }
 
     my $orig_identifier_1 = $identifier_1;
@@ -2778,12 +3056,12 @@
 
   ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
   if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
-    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
+    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
     $counting{genomic_sequence_could_not_be_extracted_count}++;
     return 0;
   }
   if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
-    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
+    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
     $counting{genomic_sequence_could_not_be_extracted_count}++;
     return 0;
   }
@@ -3077,7 +3355,8 @@
   my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
   my $flag_1 =  $methylation_call_params->{$sequence_identifier}->{flag_1};
   my $flag_2 =  $methylation_call_params->{$sequence_identifier}->{flag_2};
-#  print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
+  # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
+  # sleep(10);
   ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
   ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first)  position of the actually observed sequence
 
@@ -3111,7 +3390,7 @@
 
   my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
   my $indels_2 = 0;
-  
+
   ### Extracting read 1 genomic sequence ###
 
   # extracting 2 additional bp at the 5' end (read 1)
@@ -3160,6 +3439,7 @@
       $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
       return;
     }
+
     $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
   }
 
@@ -3209,6 +3489,10 @@
   if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
     ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
     unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
+      # need to set read 1 as well now to prevent warning
+      #  warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n";
+      #  sleep(5);
+      $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
       $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
       return;
     }
@@ -3809,6 +4093,8 @@
   ### h for not methylated C in CHH context (was converted)     ###
   ### Z for methylated C in CpG context (was protected)         ###
   ### z for not methylated C in CpG context (was converted)     ###
+  ### U for methylated C in unknown context (was protected)     ###
+  ### u for not methylated C in unknwon context (was converted) ###
   #################################################################
 
   my @match =();
@@ -3816,9 +4102,11 @@
   my $methyl_CHH_count = 0;
   my $methyl_CHG_count = 0;
   my $methyl_CpG_count = 0;
+  my $methyl_C_unknown_count = 0;
   my $unmethylated_CHH_count = 0;
   my $unmethylated_CHG_count = 0;
   my $unmethylated_CpG_count = 0;
+  my $unmethylated_C_unknown_count = 0;
 
   if ($read_conversion eq 'CT'){
     for my $index (0..$#seq) {
@@ -3832,7 +4120,10 @@
 	    ++$methyl_CpG_count;
 	    push @match,'Z'; # protected C, methylated, in CpG context
 	  }
-	
+	  elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+	    ++$methyl_C_unknown_count;
+	    push @match,'U'; # protected C, methylated, in Unknown context
+	  }	
 	  else {
 	    ### C in not in CpG-context, determining the second downstream base context
 	    my $second_downstream_base = $genomic[$index+2];
@@ -3841,6 +4132,10 @@
 	      ++$methyl_CHG_count;
 	      push @match,'X'; # protected C, methylated, in CHG context
 	    }
+	    elsif ($second_downstream_base eq 'N'){
+	      ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+	      push @match,'U'; # protected C, methylated, in Unknown context
+	    }
 	    else{
 	      ++$methyl_CHH_count;
 	      push @match,'H'; # protected C, methylated, in CHH context
@@ -3862,7 +4157,10 @@
 	    ++$unmethylated_CpG_count;
 	    push @match,'z'; # converted C, not methylated, in CpG context
 	  }
-
+	  elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+	    ++$unmethylated_C_unknown_count;
+	    push @match,'u'; # converted C, not methylated, in Unknown context
+	  }
 	  else{
 	    ### C in not in CpG-context, determining the second downstream base context
 	    my $second_downstream_base = $genomic[$index+2];
@@ -3871,6 +4169,10 @@
 	      ++$unmethylated_CHG_count;
 	      push @match,'x'; # converted C, not methylated, in CHG context
 	    }
+	    elsif ($second_downstream_base eq 'N'){
+	      ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+	      push @match,'u'; # converted C, not methylated, in Unknown context
+	    }
 	    else{
 	      ++$unmethylated_CHH_count;
 	      push @match,'h'; # converted C, not methylated, in CHH context
@@ -3903,7 +4205,10 @@
 	    ++$methyl_CpG_count;
 	    push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
 	  }
-
+	  elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+	    ++$methyl_C_unknown_count;
+	    push @match,'U'; # protected C on opposing strand, methylated, in Unknown context
+	  }
 	  else{
 	    ### C in not in CpG-context, determining the second upstream base context
 	    my $second_upstream_base = $genomic[$index];
@@ -3912,6 +4217,10 @@
 	      ++$methyl_CHG_count;
 	      push @match,'X'; # protected C on opposing strand, methylated, in CHG context
 	    }
+	    elsif ($second_upstream_base eq 'N'){
+	      ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+	      push @match,'U'; # protected C, methylated, in Unknown context
+	    }
 	    else{
 	      ++$methyl_CHH_count;
 	      push @match,'H'; # protected C on opposing strand, methylated, in CHH context
@@ -3935,7 +4244,10 @@
 	    ++$unmethylated_CpG_count;
 	    push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
 	  }
-
+	  elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
+	    ++$unmethylated_C_unknown_count;
+	    push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
+	  }
 	  else{
 	    ### C in not in CpG-context, determining the second upstream base context
 	    my $second_upstream_base = $genomic[$index];
@@ -3944,6 +4256,10 @@
 	      ++$unmethylated_CHG_count;
 	      push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
 	    }
+	    elsif ($second_upstream_base eq 'N'){
+	      ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
+	      push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
+	    }
 	    else{
 	      ++$unmethylated_CHH_count;
 	      push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
@@ -3969,9 +4285,11 @@
   $counting{total_meCHH_count} += $methyl_CHH_count;
   $counting{total_meCHG_count} += $methyl_CHG_count;
   $counting{total_meCpG_count} += $methyl_CpG_count;
+  $counting{total_meC_unknown_count} += $methyl_C_unknown_count;
   $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
   $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
   $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
+  $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count;
 
   # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
   return $methylation_call;
@@ -4106,6 +4424,13 @@
     $G_to_A_infile =~ s/$/_G_to_A.fa/;
   }
 
+  if ($prefix){
+    #  warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+    $C_to_T_infile = "$prefix.$C_to_T_infile";
+    $G_to_A_infile = "$prefix.$G_to_A_infile";
+    #  warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+  }
+
   warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
 
   if ($gzip){
@@ -4209,9 +4534,17 @@
   }
 
   my $C_to_T_infile = my $G_to_A_infile = $filename;
+
   $C_to_T_infile =~ s/$/_C_to_T.fa/;
   $G_to_A_infile =~ s/$/_G_to_A.fa/;
 
+  if ($prefix){
+    #  warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+    $C_to_T_infile = "$prefix.$C_to_T_infile";
+    $G_to_A_infile = "$prefix.$G_to_A_infile";
+    #  warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+  }
+
   if ($directional){
     if ($read_number == 1){
       warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
@@ -4354,6 +4687,13 @@
 
   my $C_to_T_infile = my $G_to_A_infile = $filename;
 
+  if ($prefix){
+    # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+    $C_to_T_infile = "$prefix.$C_to_T_infile";
+    $G_to_A_infile = "$prefix.$G_to_A_infile";
+    # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+  }
+
   if ($pbat){ # PBAT-Seq
     if ($gzip){
       $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
@@ -4515,6 +4855,13 @@
     $G_to_A_infile =~ s/$/_G_to_A.fastq/;
   }
 
+  if ($prefix){
+    #  warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
+    $C_to_T_infile = "$prefix.$C_to_T_infile";
+    $G_to_A_infile = "$prefix.$G_to_A_infile";
+    #  warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
+  }
+
   if ($directional){
     if ($read_number == 1){
       warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
@@ -4685,8 +5032,16 @@
 
   my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;
 
+  if ($prefix){
+    # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n";
+    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
+    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
+    # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
+  }
+
   $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/;
   $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/;
+  # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
 
   warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
   open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";
@@ -5557,9 +5912,11 @@
 	     total_meCHH_count => 0,
 	     total_meCHG_count => 0,
 	     total_meCpG_count => 0,
+	     total_meC_unknown_count => 0,
 	     total_unmethylated_CHH_count => 0,
 	     total_unmethylated_CHG_count => 0,
 	     total_unmethylated_CpG_count => 0,
+	     total_unmethylated_C_unknown_count => 0,
 	     sequences_count => 0,
 	     no_single_alignment_found => 0,
 	     unsuitable_sequence_count => 0,
@@ -5743,6 +6100,8 @@
   my $bam;
   my $gzip;
   my $pbat;
+  my $prefix;
+  my $old_flag;
 
   my $command_line = GetOptions ('help|man' => \$help,
 				 '1=s' => \$mates1,
@@ -5784,6 +6143,8 @@
 				 'bam' => \$bam,
 				 'gzip' => \$gzip,
 				 'pbat' => \$pbat,
+				 'prefix=s' => \$prefix,
+				 'old_flag' => \$old_flag,
 				);
 
 
@@ -5939,7 +6300,7 @@
     my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
     foreach my $file(@CT_bowtie_index){
       unless (-f $file){
-	die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
+	die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
       }
     }
     ### checking the integrity of $GA_dir
@@ -5947,7 +6308,7 @@
     my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
     foreach my $file(@GA_bowtie_index){
       unless (-f $file){
-	die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+	die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
       }
     }
   }
@@ -5958,7 +6319,7 @@
     my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
     foreach my $file(@CT_bowtie_index){
       unless (-f $file){
-	die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+	die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
       }
     }
     ### checking the integrity of $GA_dir
@@ -5966,7 +6327,7 @@
     my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
     foreach my $file(@GA_bowtie_index){
       unless (-f $file){
-	die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
+	die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
       }
     }
   }
@@ -6247,6 +6608,11 @@
       push @bowtie_options,'--no-mixed';     ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
       push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
     }
+
+    if ($old_flag){
+      warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
+      sleep(3);
+    }
   }
   elsif ($mates2){
     die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
@@ -6400,7 +6766,18 @@
     }
   }
 
-  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat);
+  ### PREFIX FOR OUTPUT FILES
+  if ($prefix){
+    # removing trailing dots
+
+    $prefix =~ s/\.+$//;
+
+    warn "Using the following prefix for output files: $prefix\n\n";
+    sleep(1);
+  }
+
+
+  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag);
 }
 
 
@@ -6638,8 +7015,17 @@
   my $read_conversion_2       = $methylation_call_params->{$id}->{read_conversion_2};
   my $genome_conversion       = $methylation_call_params->{$id}->{genome_conversion};
 
-  my $id_1 = $id.'/1';
-  my $id_2 = $id.'/2';
+  my $id_1;
+  my $id_2;
+
+  if ($old_flag){
+    $id_1 = $id.'/1';
+    $id_2 = $id.'/2';
+  }
+  else{
+    $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard
+    $id_2 = $id;
+  }
 
   # Allows all degenerate nucleotide sequences in reference genome
   die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
@@ -6759,24 +7145,50 @@
   # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
   # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
 
-  my $flag_1;                                                          # FLAG variable used for SAM format
+  my $flag_1;                                                            # FLAG variable used for SAM format
   my $flag_2;
 
+  ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH)
+
   if ($index == 0){       # OT
-    $flag_1 = 67;                                                      # Read 1 is on the + strand  (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
-    $flag_2 = 131;                                                     # Read 2 is on - strand but informative for the OT        (1+2+128)
+    unless ($old_flag){
+      $flag_1 = 99;                                                      # PH: Read 1 is on the + strand and Read 2 is reversed  (1+2+32+64)
+      $flag_2 = 147;                                                     # PH: Read 2 is on - strand but informative for the OT  (1+2+16+128)
+    }
+    else{
+      $flag_1 = 67;                                                      # Read 1 is on the + strand  (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
+      $flag_2 = 131;                                                     # Read 2 is on - strand but informative for the OT        (1+2+128)
+    }
   }
   elsif ($index == 1){    # CTOB
-    $flag_1 = 115;                                                     # Read 1 is on the + strand, we score for OB  (1+2+16+32+64)
-    $flag_2 = 179;                                                     # Read 2 is on the - strand  (1+2+16+32+128)
+    unless($old_flag){
+      $flag_1 = 83;                                                      # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed  (1+2+16+64)
+      $flag_2 = 163;                                                     # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed  (1+2+32+128)
+    }
+    else{
+      $flag_1 = 115;                                                     # Read 1 is on the + strand, we score for OB  (1+2+16+32+64)
+      $flag_2 = 179;                                                     # Read 2 is on the - strand  (1+2+16+32+128)
+    }
   }
   elsif ($index == 2){    # CTOT
-    $flag_1 = 67;                                                      # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
-    $flag_2 = 131;                                                     # Read 2 is on the + strand, score it for OT (1+2+128)
+    unless ($old_flag){
+      $flag_1 = 99;                                                      # PH: Read 1 is on the + strand and Read 2 is reversed  (1+2+32+64) 
+      $flag_2 = 147;                                                     # PH: Read 2 is on - strand but informative for the OT  (1+2+16+128) 
+    }
+    else{
+      $flag_1 = 67;                                                      # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
+      $flag_2 = 131;                                                     # Read 2 is on the + strand, score it for OT (1+2+128)
+    }
   }
   elsif ($index == 3){    # OB
-    $flag_1 = 115;                                                     # Read 1 is on the - strand, we score for OB  (1+2+16+32+64)
-    $flag_2 = 179;                                                     # Read 2 is on the + strand  (1+2+16+32+128)
+    unless ($old_flag){
+      $flag_1 = 83;                                                      # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed  (1+2+16+64)
+      $flag_2 = 163;                                                     # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed  (1+2+32+128)
+    }
+    else{
+      $flag_1 = 115;                                                     # Read 1 is on the - strand, we score for OB  (1+2+16+32+64)
+      $flag_2 = 179;                                                     # Read 2 is on the + strand  (1+2+16+32+128)
+    }
   }
 
   #####
@@ -6846,11 +7258,12 @@
 	# or
 	#
 	# ------------------------->     read 1
-	# <-----------                   read 2   read 2 contained within read 1
-
-	# start and end of read 2  are fully contained within read 1
-	$tlen_1 = 0;                                                       # Set as 0 when the information is unavailable
-	$tlen_2 = 0;                                                       # Set as 0 when the information is unavailable
+	# <------------------------      read 2   read 2 contained within read 1
+
+	# start and end of read 2  are fully contained within read 1, using the length of read 1 for the TLEN variable
+	$tlen_1 = $end_read_1 - $start_read_1 + 1;          # Set to length of read 1   Leftmost read has a + sign,
+	$tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1;   # Set to length of read 1   Rightmost read has a - sign. well this is debatable. Changed this
+                                                            ### as a request by frozenlyse on SeqAnswers on 24 July 2013
       }
 
     }
@@ -6882,12 +7295,13 @@
 	# or
 	#
 	# ------------------------->     read 2
-	# <-----------                   read 1   read 1 contained within read 2
+	#  <------------------------      read 1   read 1 contained within read 2
 	
-	# start and end of read 1  are fully contained within read 2
-	$tlen_1 = 0;                                                       # Set as 0 when the information is unavailable
-	$tlen_2 = 0;                                                       # Set as 0 when the information is unavailable
-      }
+	# start and end of read 1  are fully contained within read 2, using the length of read 2 for the TLEN variable
+	$tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1;          # Set to length of read 2   Shorter read receives a - sign,
+	$tlen_2 = $end_read_2 - $start_read_2 + 1;                 # Set to length of read 2   Longer read receives a +. Well this is debatable. Changed this
+	                                                           ### as a request by frozenlyse on SeqAnswers on 24 July 2013
+     }
     }
   }
 
@@ -7353,6 +7767,27 @@
 --samtools_path          The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
                          explicitly if Samtools is in the PATH already.
 
+--prefix <prefix>        Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
+                         example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
+
+--old_flag               Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
+                         this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
+                         both the appended read IDs and custom FLAG values may cause problems with some downstream tools 
+                         such as Picard, new defaults were implemented as of version 0.8.3.
+
+
+                                             default                         old_flag
+                                       ===================              ===================
+                                       Read 1       Read 2              Read 1       Read 2
+
+                              OT:         99          147                  67          131
+
+                              OB:         83          163                 115          179
+
+                              CTOT:       99          147                  67          131
+
+                              CTOB:       83          163                 115          179
+
 
 
 Other:
@@ -7518,7 +7953,7 @@
 Each read of paired-end alignments is written out in a separate line in the above format.
 
 
-Last edited on 10 May 2013.
+Last edited on 07 October 2013.
 
 HOW_TO
 }
author	bgruening
date	Mon, 14 Apr 2014 16:43:14 -0400
parents	62c6da72dd4a
children